This repository has been archived by the owner on Jul 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nn.recipe
80 lines (64 loc) · 3.09 KB
/
nn.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# -*- coding: utf-8 -*-
##
## Title: Nasha Niva (NN.BY) news Recipe
## Contact: wistful - <wst dot public dot mail at gmail dot com>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: wistful - <wst dot public dot mail at gmail dot com>
##
## Written: December 2011
## Last Edited: 2012-07-13
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'
import re
from calibre.web.feeds.news import BasicNewsRecipe
from urlparse import urljoin
'''
http://nn.by/
'''
# Matches one markup tag: '<', then any run of characters other than '<',
# up to the nearest '>'.
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the text of *soup_tag* with every markup tag removed.

    The argument is converted with ``str()`` and each ``<...>`` tag matched
    by ``rm_tags`` is deleted; the text between the tags is returned as is
    (no whitespace stripping, no entity decoding).

    ``None`` yields an empty string. Without this guard ``str(None)`` would
    leak the literal text ``'None'`` into titles and descriptions whenever a
    soup lookup found nothing.
    """
    if soup_tag is None:
        return ''
    return rm_tags.sub('', str(soup_tag))
class NN(BasicNewsRecipe):
    """calibre news recipe for Nasha Niva (nn.by).

    Feeds are not read from RSS: ``parse_index`` scrapes the links of the
    site's main menu, treats every link as one feed and collects the
    articles listed on the page behind it.
    """

    oldest_article = 50
    max_articles_per_feed = 100
    __author__ = 'wistful'
    masthead_url = "http://nn.by/images/main/logo.png"
    title = u'Наша Ніва'
    description = u'Наша Ніва - Першая беларуская газета'
    INDEX = 'http://nn.by/rss.php'
    main_url = 'http://nn.by'
    publisher = 'nn.by'
    publication_type = 'newsportal'
    category = 'news, Belarus'
    # 'be' is the ISO 639-1 code for Belarusian; the previous value 'by' is
    # the country code of Belarus, not a language code.
    lang = 'be'
    language = 'be'
    direction = 'ltr'

    # Drop anything whose href/src points at a "*_logo.png" image, plus
    # <noindex> and <object> elements.
    remove_tags = [
        dict(attrs={'href': re.compile(r'.*_logo\.png$', re.IGNORECASE)}),
        dict(attrs={'src': re.compile(r'.*_logo\.png$', re.IGNORECASE)}),
        dict(name='noindex'),
        dict(name='object'),
    ]
    # Only these containers are kept from a downloaded article page.
    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile(r'\bcopy-area\b')}),
        dict(name='h1', attrs={'class': re.compile(r'\barticle-title\b')}),
        dict(name='div', attrs={'class': re.compile(r'\barticle-info\b')}),
        dict(name='div', attrs={'class': re.compile(r'\barticle-content\b')}),
    ]
    no_stylesheets = True
    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    def _menu_categories(self):
        """Return ``(name, url)`` pairs for the links in ``div#mainmenu``."""
        menu = self.index_to_soup(self.main_url).find('div', attrs={'id': 'mainmenu'})
        if menu is None:
            self.log.warn('Main menu not found on', self.main_url)
            return []
        categories = []
        for item in menu.findAll('a'):
            href = item.get('href')
            if not href:
                continue
            # urljoin copes with relative, root-relative and absolute hrefs,
            # which plain string concatenation does not.
            categories.append((cleanup(item), urljoin(self.main_url, href)))
        return categories

    def _find_section(self, doc):
        """Return ``table#content > td > div.section`` of *doc*, or None."""
        node = doc.find('table', attrs={'id': 'content'})
        if node is not None:
            node = node.find('td')
        if node is not None:
            node = node.find('div', attrs={'class': 'section'})
        return node

    def _parse_article(self, rec):
        """Build the article dict for one ``<li>`` entry, or None if it has no link."""
        heading = rec.find('h2')
        link = heading.a if heading is not None else None
        if link is None or not link.get('href'):
            return None
        rec_title = cleanup(link)
        # Rewrite the article view ('c=ar') to its print view ('c=arprint').
        rec_url = urljoin(self.main_url, link['href']).replace('c=ar', 'c=arprint')

        # The teaser is the first link inside the first <p> of div.ne-t1;
        # an entry without one simply gets an empty description.
        rec_descr = ''
        teaser = rec.find('div', attrs={'class': re.compile(r'\bne-t1\b')})
        if teaser is not None and teaser.p is not None:
            teaser_link = teaser.p.find('a')
            if teaser_link is not None:
                rec_descr = cleanup(teaser_link)

        self.log.debug(rec_title, rec_url, rec_descr)
        return {'title': rec_title, 'url': rec_url, 'content': '', 'date': '', 'description': rec_descr}

    def parse_index(self):
        """Scrape the site menu and category pages into calibre's feed structure.

        Returns:
            A list of ``(feed_title, articles)`` tuples, one per main-menu
            link, where ``articles`` is a list of dicts with the keys
            ``title``, ``url``, ``content``, ``date`` and ``description``.
            Category pages without the expected article section are logged
            and skipped; list entries without a headline link are ignored.
        """
        categories = self._menu_categories()
        self.log.debug(categories)
        feeds = []
        for c_name, c_url in categories:
            content = self._find_section(self.index_to_soup(c_url))
            if content is None:
                self.log.warn('No article section found on', c_url)
                continue
            articles = []
            for rec in content.findAll('li'):
                article = self._parse_article(rec)
                if article is not None:
                    articles.append(article)
            feeds.append((c_name, articles))
        return feeds