nn.recipe

#!/usr/bin/env  python
# -*- coding: utf-8 -*-

##
## Title:        Nasha Niva (NN.BY) news Recipe
## Contact:      wistful - <wst dot public dot mail at gmail dot com>
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright:    wistful - <wst dot public dot mail at gmail dot com>
##
## Written:      December 2011
## Last Edited:  2012-07-13
##

__license__     = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2012 wistful <wst dot public dot mail at gmail dot com>'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from urlparse import urljoin


'''
http://nn.by/
'''


# Matches one markup tag: the shortest "<...>" run that contains no other "<".
rm_tags = re.compile(r'<[^<]*?>')


def cleanup(soup_tag):
    """Return the string form of *soup_tag* with every markup tag removed.

    The argument is converted with ``str()`` and everything matched by
    ``rm_tags`` is deleted; text between tags (including entities and
    whitespace) is left untouched.

    ``None`` (what a failed soup lookup returns) yields an empty string
    instead of the literal text ``'None'``.
    """
    if soup_tag is None:
        return ''
    return rm_tags.sub('', str(soup_tag))


class NN(BasicNewsRecipe):
    """Calibre recipe for the Nasha Niva (nn.by) news site.

    The index is not taken from an RSS feed: ``parse_index`` scrapes the
    site's main menu for sections and then each section page for its
    article list.
    """
    oldest_article          = 50
    max_articles_per_feed   = 100
    __author__              = 'wistful'
    masthead_url            = "http://nn.by/images/main/logo.png"
    title                   = u'Наша Ніва'
    description             = u'Наша Ніва - Першая беларуская газета'
    # NOTE: INDEX is not referenced anywhere in this class.
    INDEX                   = 'http://nn.by/rss.php'
    main_url                = 'http://nn.by'
    publisher               = 'nn.by'
    publication_type        = 'newsportal'
    category                = 'news, Belarus'

    # 'be' is the ISO 639-1 code for Belarusian; 'by' is the country code
    # for Belarus and is not a language code.
    lang        = 'be'
    language    = 'be'
    direction   = 'ltr'

    # Drop anything linking to / showing a "*_logo.png" image, plus
    # <noindex> and <object> elements.
    remove_tags = [dict(attrs={'href': re.compile(r'.*_logo\.png$', re.IGNORECASE)}),
                   dict(attrs={'src': re.compile(r'.*_logo\.png$', re.IGNORECASE)}),
                   dict(name='noindex'), dict(name='object')]
    keep_only_tags = [dict(name='div', attrs={'class': re.compile(r'\bcopy-area\b')}),
                      dict(name='h1', attrs={'class': re.compile(r'\barticle-title\b')}),
                      dict(name='div', attrs={'class': re.compile(r'\barticle-info\b')}),
                      dict(name='div', attrs={'class': re.compile(r'\barticle-content\b')}),
                    ]

    no_stylesheets = True

    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (publisher, description, category)

    def parse_index(self):
        """Build the list of feeds by scraping the site's section pages.

        Every link in the ``div#mainmenu`` element of the front page is
        treated as a section. Each section page is fetched and its ``<li>``
        items are turned into article entries.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``content``, ``date`` and ``description``.

        Raises:
            ValueError: if the main menu cannot be found on the front page,
                since no section can be discovered without it.
        """
        menu = self.index_to_soup(self.main_url).find('div', attrs={'id': 'mainmenu'})
        if menu is None:
            raise ValueError('Main menu not found on %s' % self.main_url)

        feeds = []
        for link in menu.findAll('a'):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot name a section page.
                continue
            c_name = cleanup(link)
            # urljoin copes with absolute hrefs and with relative ones that
            # lack a leading slash, which plain concatenation does not.
            c_url = urljoin(self.main_url, href)

            content = self._find_section_content(self.index_to_soup(c_url))
            if content is None:
                self.log.warn('No article list found in section', c_name, c_url)
                continue

            articles = []
            for rec in content.findAll('li'):
                article = self._article_from_item(rec)
                if article is None:
                    continue
                self.log.debug(article['title'], article['url'], article['description'])
                articles.append(article)
            feeds.append((c_name, articles))
        return feeds

    def _find_section_content(self, doc):
        """Return the ``div.section`` inside the first cell of ``table#content``, or None."""
        table = doc.find('table', attrs={'id': 'content'})
        cell = table.find('td') if table is not None else None
        if cell is None:
            return None
        return cell.find('div', attrs={'class': 'section'})

    def _article_from_item(self, rec):
        """Convert one ``<li>`` of a section listing into an article dict, or None if it has no headline link."""
        heading = rec.find('h2')
        link = heading.a if heading is not None else None
        if link is None or not link.get('href'):
            # List items without a linked <h2> headline are not articles.
            return None

        # Switch the "c=ar" query value to "c=arprint" -- presumably the
        # site's print-friendly article view (TODO confirm).
        rec_url = urljoin(self.main_url, link['href']).replace('c=ar', 'c=arprint')

        # The teaser is the text of the first link inside the first
        # paragraph of the "ne-t1" block; it is optional.
        teaser_box = rec.find('div', attrs={'class': re.compile(r'\bne-t1\b')})
        teaser_par = teaser_box.p if teaser_box is not None else None
        teaser_link = teaser_par.find('a') if teaser_par is not None else None
        rec_descr = cleanup(teaser_link) if teaser_link is not None else ''

        return {'title': cleanup(link), 'url': rec_url, 'content': '',
                'date': '', 'description': rec_descr}