This repository has been archived by the owner on Jul 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
habr.recipe
98 lines (80 loc) · 3.79 KB
/
habr.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
##
## Title: Habrahabr hub Recipe
## Contact: wistful - <wst dot public dot mail at gmail dot com>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: wistful - <wst dot public dot mail at gmail dot com>
##
## Written: December 2011
## Last Edited: 2012-07-13
##
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'
import re
from calibre.web.feeds.news import BasicNewsRecipe
'''
http://habrahabr.ru/hub/
'''
# Matches one markup tag: '<', then any run of characters other than '<'
# (as short as possible), then '>'.
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the text of *soup_tag* with every markup tag removed.

    The argument is converted with ``str()`` and each ``<...>`` sequence
    is deleted; text between the tags is kept unchanged (no entity
    decoding and no whitespace trimming is performed).

    ``None`` yields an empty string, so a failed tag lookup does not end
    up as the literal text ``'None'``.
    """
    if soup_tag is None:
        return ''
    return rm_tags.sub('', str(soup_tag))
class Habr(BasicNewsRecipe):
    """Calibre news recipe that builds one feed per Habrahabr hub.

    Every entry of ``hub`` maps a feed title to the URL of that hub's index
    page. ``parse_index`` downloads each index page and turns the posts
    found on it into article entries.
    """

    # Feed title -> hub index page URL.
    hub = {
        'webdev': 'http://habrahabr.ru/hub/webdev/',
        'javascript': 'http://habrahabr.ru/hub/javascript/',
        'jQuery': 'http://habrahabr.ru/hub/jquery/',
        'css': 'http://habrahabr.ru/hub/css/',
        'python': 'http://habrahabr.ru/hub/python/',
        'linux': 'http://habrahabr.ru/hub/linux/',
        'Убунтариум': 'http://habrahabr.ru/hub/ubuntu/',
        'development': 'http://habrahabr.ru/hub/development/',
        'programming': 'http://habrahabr.ru/hub/programming/',
        'Git': 'http://habrahabr.ru/hub/Git/',
        'web design': 'http://habrahabr.ru/hub/web_design/',
        'Управление проектами': 'http://habrahabr.ru/hub/pm/',
        'Open Source': 'http://habrahabr.ru/hub/open_source/',
        'Human Resources': 'http://habrahabr.ru/hub/hr/',
    }

    masthead_url = 'http://upload.wikimedia.org/wikipedia/ru/7/7f/Habrahabr_logo.png'
    oldest_article = 20
    max_articles_per_feed = 20
    title = u'Хабр'
    __author__ = 'wistful'
    description = u'Habrahabr hub'
    publisher = 'habrahabr.ru'
    publication_type = 'blog'
    category = 'it, tech, news'
    language = 'ru'
    no_stylesheets = True
    extra_css = """
        body {font-size: 90%;}
        .content h2 {margin: 0;}
        pre, code, tt { font-size: 55%;
                        letter-spacing: -0.1pt;
        }
        .comment {font-size: 0.5em;}
    """
    direction = 'ltr'

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    # CSS class-name patterns, compiled once instead of on every loop
    # iteration. The word boundaries let them match one class inside a
    # multi-class attribute value.
    _post_class = re.compile(r'\bpost\b')
    _post_title_class = re.compile(r'\bpost_title\b')
    _content_class = re.compile(r'\bcontent\b')
    _habracut_class = re.compile(r'\bhabracut\b')
    _comments_class = re.compile(r'\bcomments_list\b')

    keep_only_tags = [
        dict(name='div', attrs={'class': _post_class}),
        dict(name='div', attrs={'class': _comments_class}),
    ]

    def parse_index(self):
        """Build the feed list from the hub index pages.

        Returns:
            A list of ``(feed_title, articles)`` tuples, one per entry of
            ``hub``. Each article is a dict with the keys ``title``,
            ``url``, ``content``, ``date`` and ``description``.
        """
        feeds = []
        for blog_name, blog_url in self.hub.items():
            doc = self.index_to_soup(blog_url)
            articles = []
            for post in doc.findAll('div', attrs={'class': self._post_class}):
                article = self._parse_post(post)
                if article is not None:
                    articles.append(article)
            feeds.append((blog_name, articles))
        return feeds

    def _parse_post(self, post):
        """Turn one post ``div`` into an article dict, or None if unusable."""
        title_link = post.find('a', attrs={'class': self._post_title_class})
        # A post without a linked title has no URL to download. Skip it
        # rather than letting a failed lookup abort the whole index.
        if title_link is None:
            return None
        url = title_link.get('href')
        if not url:
            return None

        teaser = post.find('div', attrs={'class': self._content_class})
        if teaser is None:
            # No teaser block: use an empty description, not the text 'None'.
            summary = ''
        else:
            # Drop images and "habracut" blocks from the teaser before it
            # is serialised as the article description.
            for tag in teaser.findAll('img'):
                tag.extract()
            for tag in teaser.findAll('div', attrs={'class': self._habracut_class}):
                tag.extract()
            summary = str(teaser)

        return {
            'title': cleanup(title_link),
            'url': url,
            'content': '',
            'date': '',
            'description': summary,
        }