This repository has been archived by the owner on Jul 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
belapan.recipe
95 lines (74 loc) · 3.05 KB
/
belapan.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- coding: utf-8 -*-
##
## Title: Belapan News
## Contact: wistful - <wst dot public dot mail at gmail dot com>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: wistful - <wst dot public dot mail at gmail dot com>
##
## Written: December 2011
## Last Edited: 2012-05-22
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'
from urlparse import urljoin
import re
from calibre.web.feeds.news import BasicNewsRecipe
'''
http://belapan.by/
'''
# Matches one markup tag: '<', the shortest run of non-'<' characters, then '>'.
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the string form of *soup_tag* with every markup tag removed.

    The argument is converted with ``str()`` first, so any object with a
    string representation is accepted; only the text between tags survives.
    """
    markup = str(soup_tag)
    return rm_tags.sub('', markup)
class Belapan(BasicNewsRecipe):
    """Calibre recipe for the belapan.by news site.

    Articles are not taken from RSS: ``parse_index`` scrapes each category
    listing page named in ``categories`` and builds the feed list by hand.
    """

    oldest_article = 3
    max_articles_per_feed = 100
    masthead_url = 'http://belapan.by/webroot/delivery/pic/logo.gif'

    title = u'БЕЛАПАН'
    __author__ = 'wistful'
    description = u'Белорусская информационная компания'
    publisher = 'belapan.by'
    publication_type = 'newsportal'
    category = 'news'
    # Base URL used to resolve the (possibly relative) article links.
    INDEX = 'http://belapan.by/'
    language = 'ru'
    lang = 'ru'
    direction = 'ltr'

    # Section title -> URL of the listing page scraped for that section.
    categories = {
        u'Политика': 'http://belapan.by/news/politics/',
        u'Экономика': 'http://belapan.by/news/economy/',
        u'Финансы': 'http://belapan.by/news/finances/',
        u'Общество': 'http://belapan.by/news/society/',
        u'Культура': 'http://belapan.by/news/culture/',
        u'Спорт': 'http://belapan.by/news/sport/',
    }

    # Only the headline, the article body and the author line are kept.
    keep_only_tags = [dict(attrs={'class': 'news-title'}),
                      dict(name='div', attrs={'class': 'wysiwygContent'}),
                      dict(name='div', attrs={'class': 'auth-line'})]

    no_stylesheets = True
    extra_css = """
body, body.calibre {font-size: 11pt;}
p {margin: 0.5em 0;}
"""

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    def parse_index(self):
        """Scrape every category page and return the feeds for calibre.

        Returns:
            A list of ``(category_name, articles)`` tuples. Each article is
            a dict with the keys ``title``, ``url``, ``content``, ``date``
            and ``description``; only ``title`` and ``url`` are filled in.

        A category page without the ``docsslot`` container is logged and
        contributes an empty article list instead of aborting the whole
        download. Anchors that carry no ``href`` are skipped.
        """
        feeds = []
        for category_name, category_url in self.categories.items():
            soup = self.index_to_soup(category_url)
            articles = []

            container = soup.find(attrs={'id': 'docsslot'})
            if container is None:
                # Page layout changed or the page failed to render fully;
                # keep going so the other categories are still fetched.
                self.log.warn('No article list found at %s' % category_url)
                items = []
            else:
                items = container.findAll('li')

            for item in items:
                # NOTE(review): entries flagged with an <img class="key">
                # are presumably restricted/paid articles - confirm.
                if item.find('img', attrs={'class': 'key'}):
                    continue
                more_link = item.a
                if not more_link:
                    continue
                href = more_link.get('href')
                if not href:
                    # An anchor without a target cannot be downloaded.
                    continue
                articles.append({
                    'title': cleanup(more_link),
                    'url': urljoin(self.INDEX, href),
                    'content': '',
                    'date': '',
                    'description': '',
                })

            feeds.append((category_name, articles))
        return feeds