habr.recipe

#!/usr/bin/env  python
# -*- coding: utf-8 -*-

##
## Title:        Habrahabr hub Recipe
## Contact:      wistful - <wst dot public dot mail at gmail dot com>
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright:    wistful - <wst dot public dot mail at gmail dot com>
##
## Written:      December 2011
## Last Edited:  2012-07-13
##

__license__     = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'

import re
from calibre.web.feeds.news import BasicNewsRecipe


'''
http://habrahabr.ru/hub/
'''


# One markup tag: '<', then the shortest run of characters other than '<',
# then the closing '>'.
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the text of ``soup_tag`` with every markup tag removed.

    The argument is converted with ``str()`` first, so any object is
    accepted; whatever ``rm_tags`` matches in that text is dropped.
    """
    markup = str(soup_tag)
    return re.sub(rm_tags, '', markup)


class Habr(BasicNewsRecipe):
    """Calibre recipe that builds one feed per Habrahabr hub.

    Each entry of ``hub`` maps a feed title to the hub page that
    ``parse_index`` scrapes for posts.
    """

    # Feed title -> hub index page; parse_index() turns every entry into a feed.
    hub = {
        'webdev': 'http://habrahabr.ru/hub/webdev/',
        'javascript': 'http://habrahabr.ru/hub/javascript/',
        'jQuery': 'http://habrahabr.ru/hub/jquery/',
        'css': 'http://habrahabr.ru/hub/css/',
        'python': 'http://habrahabr.ru/hub/python/',
        'linux': 'http://habrahabr.ru/hub/linux/',
        'Убунтариум': 'http://habrahabr.ru/hub/ubuntu/',
        'development': 'http://habrahabr.ru/hub/development/',
        'programming': 'http://habrahabr.ru/hub/programming/',
        'Git': 'http://habrahabr.ru/hub/Git/',
        'web design': 'http://habrahabr.ru/hub/web_design/',
        'Управление проектами': 'http://habrahabr.ru/hub/pm/',
        'Open Source': 'http://habrahabr.ru/hub/open_source/',
        # Feed title typo fixed (was 'Human Rresources').
        'Human Resources': 'http://habrahabr.ru/hub/hr/',
    }

    masthead_url = 'http://upload.wikimedia.org/wikipedia/ru/7/7f/Habrahabr_logo.png'

    oldest_article = 20
    max_articles_per_feed = 20
    title = u'Хабр'
    __author__ = 'wistful'
    description = u'Habrahabr hub'
    publisher = 'habrahabr.ru'
    publication_type = 'blog'
    category = 'it, tech, news'
    language = 'ru'

    no_stylesheets = True
    extra_css = """
        body {font-size: 90%;}
        .content h2 {margin: 0;}
        pre, code, tt { font-size: 55%;
                    letter-spacing: -0.1pt;
                  }
        .comment {font-size: 0.5em;}
        """
    direction = 'ltr'

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    # Article pages are reduced to the post body and the comment list.
    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile(r'\bpost\b')}),
        dict(name='div', attrs={'class': re.compile(r'\bcomments_list\b')}),
    ]

    def parse_index(self):
        """Scrape every hub page and return the resulting feeds.

        Returns a list of ``(feed title, articles)`` tuples, one per entry of
        ``hub``. Each article is a dict with the keys ``title``, ``url``,
        ``content``, ``date`` and ``description``. Posts that have no title
        link, or whose title link has no ``href``, are skipped.
        """
        # Loop-invariant patterns: compile once rather than once per post.
        post_class = re.compile(r'\bpost\b')
        title_class = re.compile(r'\bpost_title\b')
        content_class = re.compile(r'\bcontent\b')
        habracut_class = re.compile(r'\bhabracut\b')

        feeds = []
        for blog_name, blog_url in self.hub.items():
            doc = self.index_to_soup(blog_url)
            articles = []
            for post in doc.findAll('div', attrs={'class': post_class}):
                title_link = post.find('a', attrs={'class': title_class})
                # A "post" div without a usable title link cannot become an
                # article; skip it instead of failing the whole download.
                if title_link is None:
                    continue
                url = title_link.get('href')
                if not url:
                    continue

                summary = post.find('div', attrs={'class': content_class})
                if summary is None:
                    # No content div: use an empty description rather than
                    # the literal text 'None'.
                    summary_html = ''
                else:
                    # Drop images and the "habracut" divs from the teaser.
                    for node in summary.findAll('img'):
                        node.extract()
                    for node in summary.findAll('div', attrs={'class': habracut_class}):
                        node.extract()
                    summary_html = str(summary)

                articles.append({
                    'title': cleanup(title_link),
                    'url': url,
                    'content': '',
                    'date': '',
                    'description': summary_html,
                })
            feeds.append((blog_name, articles))
        return feeds