This repository has been archived by the owner on Jul 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
tutby.recipe
82 lines (66 loc) · 2.73 KB
/
tutby.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding: utf-8 -*-
##
## Title: TUT.BY news Recipe
## Contact: wistful - <wst dot public dot mail at gmail dot com>'
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: wistful - <wst dot public dot mail at gmail dot com>'
##
## Written: December 2011
## Last Edited: 2012-05-22
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
from urlparse import urljoin
import re
'''
http://tut.by/
'''
# Matches a single markup tag: '<', then the shortest run of characters that
# are not '<', then '>'.
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the string form of *soup_tag* with every ``<...>`` tag removed.

    *soup_tag* may be any object whose ``str()`` yields markup (the recipe
    passes parsed soup elements).  ``None`` -- e.g. the result of a failed
    ``find()`` -- yields an empty string instead of the literal text
    ``'None'``.
    """
    if soup_tag is None:
        return ''
    return rm_tags.sub('', str(soup_tag))
class TutBy(BasicNewsRecipe):
    """News recipe for the TUT.BY portal.

    The article list is built by ``parse_index`` from the mobile front page
    (``INDEX``); the remaining class attributes are recipe settings
    (presumably consumed by calibre's ``BasicNewsRecipe`` -- see the calibre
    recipe API documentation for their exact semantics).
    """

    oldest_article = 20
    max_articles_per_feed = 100
    masthead_url = 'http://img.tyt.by/titul_by2/logo_ru2.gif'
    title = 'TUT.BY'
    __author__ = 'wistful'
    description = u'НОВОСТИ ПОРТАЛА TUT.BY'
    publisher = 'TUT.BY'
    publication_type = 'newsportal'
    category = 'news, Belarus'
    # Page that parse_index() downloads and scans for sections and links.
    INDEX = 'http://m.tut.by/'
    language = 'ru'
    lang = 'ru'
    direction = 'ltr'

    # Selects <div> elements whose class attribute contains "article" as a
    # whitespace-delimited token (case-insensitive).
    keep_only_tags = [
        dict(name='div',
             attrs={'class': re.compile(r"(^|.*\s+)article(\s+.*|$)", re.IGNORECASE)}),
    ]
    # Selects <object> elements, <div>s whose id starts with "swf", and
    # links whose href contains "upload".
    remove_tags = [
        dict(name='object'),
        dict(name='div', attrs={'id': re.compile('^swf.*', re.IGNORECASE)}),
        dict(name='a', attrs={'href': re.compile('upload', re.IGNORECASE)}),
    ]
    no_stylesheets = True
    # The CSS text is kept flush-left so the string value is unchanged.
    extra_css = """
.date, .rubric, .authorContainer {font-size: 0.6em;}
ul.author {margin: 5px 0;}
"""

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]
    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    def parse_index(self):
        """Build the feed list from the ``INDEX`` page.

        Every ``<dl>`` on the page is treated as one section: its ``<h2>``
        (tags stripped) is the section title and the first ``<a>`` inside
        each ``<dd>`` is one article.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``, ``url``,
        ``content``, ``date`` and ``description``.
        """
        doc = self.index_to_soup(self.INDEX)
        feeds = []
        for category in doc.findAll('dl'):
            category_name = cleanup(category.find('h2'))
            articles = []
            for post in category.findAll('dd'):
                more_link = post.find('a')
                # Skip entries without a usable link: an <a> lacking an href
                # would otherwise raise KeyError and abort the whole index.
                href = more_link.get('href') if more_link else None
                if not href:
                    continue
                # Resolve against the index URL, then rewrite mobile news
                # links (m.tut.by/news/...) to the news.tut.by host.
                link = urljoin(self.INDEX, href).replace('m.tut.by/news/', 'news.tut.by/')
                title = cleanup(more_link)
                articles.append({'title': title, 'url': link, 'content': '', 'date': '', 'description': ''})
            feeds.append((category_name, articles))
        return feeds