Improved recipe for Politico

2025-07-09 03:04:10 -04:00 · 2009-12-17 10:18:32 -07:00 · 2009-12-17 10:18:32 -07:00 · 1d0784f7c7
commit 1d0784f7c7
parent 3737a93493 3e3959eef4
8 changed files with 69 additions and 20 deletions
--- a/resources/recipes/politico.recipe
+++ b/resources/recipes/politico.recipe
@ -1,4 +1,5 @@
 #!/usr/bin/env  python
 # -*- coding: cp1252 -*-
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
@ -13,7 +14,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Politico(BasicNewsRecipe):
    title                 = 'Politico'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Darko Miletic and Sujata Raman'
    description           = 'Political news from USA'
    publisher             = 'Capitol News Company, LLC'
    category              = 'news, politics, USA'
@ -22,10 +23,9 @@ class Politico(BasicNewsRecipe):
    use_embedded_content  = False
    no_stylesheets        = True
    remove_javascript     = True
-    encoding              = 'cp1252'
+    encoding              = 'UTF-8'
    language = 'en'
    html2lrf_options = [
                          '--comment', description
                        , '--category', category
@ -35,10 +35,22 @@ class Politico(BasicNewsRecipe):
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-    remove_tags       = [dict(name=['notags','embed','object','link','img'])]
+    remove_tags       = [
                         dict(name=['notags','embed','object','link','img']),
                         ]
    extra_css = '''
                body{font-family:Arial,Sans-serif;}
                element.style{color:#FF0000;font-family:Arial,Sans-serif;}
                .author{color:#808080;font-size:x-small;}
                a{ color:#003399;}
                .byline{color:#696969 ; font-size:x-small;}
                .story{color:#000000;}
                td{color:#000000;}
                '''
    feeds = [
-               (u'Top Stories' , u'http://www.politico.com/rss/politicopicks.xml' )
+              (u'Top Stories' , u'http://www.politico.com/rss/politicopicks.xml' )
              ,(u'Congress'    , u'http://www.politico.com/rss/congress.xml'      )
              ,(u'Ideas'       , u'http://www.politico.com/rss/ideas.xml'         )
              ,(u'Life'        , u'http://www.politico.com/rss/life.xml'          )
@ -48,17 +60,23 @@ class Politico(BasicNewsRecipe):
              ,(u'Roger Simon' , u'http://www.politico.com/rss/rogersimon.xml'    )
              ,(u'Suite Talk'  , u'http://www.politico.com/rss/suitetalk.xml'     )
              ,(u'Playbook'    , u'http://www.politico.com/rss/playbook.xml'      )
-              ,(u'The Huddle'  , u'http://www.politico.com/rss/huddle.xml'        )
+              #(u'The Huddle'  , u'http://www.politico.com/rss/huddle.xml' )
            ]
    def preprocess_html(self, soup):
-        mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
+        mtag = '<meta http-equiv="Content-Language" content="en"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup
-    url_pat = re.compile(r'<a href="([^"]+printstory\.cfm[^"]+)"')
+    url_pat = re.compile(r'<a href="([^"]+print.*\.cfm[^"]+)"')
    def postprocess_html(self, soup, first):
        """Rename every table, tr and td tag in *soup* to div.

        The soup is modified in place and the same object is returned.
        The *first* argument is accepted but not used here.
        """
        table_tag_names = ['table', 'tr', 'td']
        for node in soup.findAll(name=table_tag_names):
            node.name = 'div'
        return soup
    def print_version(self, url):
        raw = self.index_to_soup(url, raw=True)
--- a/resources/recipes/sueddeutsche.recipe
+++ b/resources/recipes/sueddeutsche.recipe
@ -26,9 +26,12 @@ class Sueddeutsche(BasicNewsRecipe):
                        dict(name='div', attrs={'id':["artikel","contentTable"]}) ,
                         ]
    remove_tags = [ dict(name='link'), dict(name='iframe'),
-                    dict(name='div', attrs={'id':["themenbox","artikelfoot","CAD_AD","rechteSpalte"]}),
+                    dict(name='div', attrs={'id':["themenbox","artikelfoot","CAD_AD","SKY_AD","NT1_AD","rechteSpalte"]}),
                    dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg","pages closed"]}),
                    dict(name='div', attrs={'class':["listHeader","listHeader2","hr2","item","videoBigButton"]}),
                    dict(name='p', attrs={'class':["ressortartikeln",]}),
                    dict(name='div', attrs={'style':["position:relative;"]}),
                    dict(name='span', attrs={'class':["nlinkheaderteaserschwarz",]}),
                    dict(name='table', attrs={'class':["kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
                    dict(name='ul', attrs={'class':["breadcrumb","articles","activities"]}),
                    dict(name='p', text = "ANZEIGE")
@ -66,3 +69,4 @@ class Sueddeutsche(BasicNewsRecipe):
--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@ -121,5 +121,7 @@ class ITALICA(EB600):
    VENDOR_NAME = 'ITALICA'
    WINDOWS_MAIN_MEM = 'EREADER'
    WINDOWS_CARD_A_MEM = WINDOWS_MAIN_MEM
    OSX_MAIN_MEM = 'Italica eReader Media'
    OSX_CARD_A_MEM = OSX_MAIN_MEM
--- a/src/calibre/ebooks/init.py
+++ b/src/calibre/ebooks/init.py
@ -25,7 +25,7 @@ class DRMError(ValueError):
 BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
                   'html', 'xhtml', 'pdf', 'pdb', 'prc', 'mobi', 'azw', 'doc',
                   'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'oebzip',
-                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1']
+                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml']
 class HTMLRenderer(object):
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -16,10 +16,16 @@ class MOBIInput(InputFormatPlugin):
                accelerators):
        from calibre.ebooks.mobi.reader import MobiReader
        from lxml import html
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline)
        parse_cache = {}
-        mr.extract_content('.', parse_cache)
+        try:
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline)
            mr.extract_content('.', parse_cache)
        except:
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline, try_extra_data_fix=True)
            mr.extract_content('.', parse_cache)
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode):
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -108,7 +108,7 @@ class EXTHHeader(object):
 class BookHeader(object):
-    def __init__(self, raw, ident, user_encoding, log):
+    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@ -141,7 +141,8 @@ class BookHeader(object):
                self.codec = 'cp1252' if user_encoding is None else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
-            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
+            if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
                or (try_extra_data_fix and self.length == 0xE4):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
@ -229,7 +230,8 @@ class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
-    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
+    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
            try_extra_data_fix=False):
        self.log = log
        self.debug = debug
        self.embedded_mi = None
@ -284,7 +286,7 @@ class MobiReader(object):
        self.book_header = BookHeader(self.sections[0][0], self.ident,
-            user_encoding, self.log)
+            user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
        self.name = self.name.decode(self.book_header.codec, 'replace')
    def extract_content(self, output_dir, parse_cache):
--- a/src/calibre/gui2/viewer/documentview.py
+++ b/src/calibre/gui2/viewer/documentview.py
@ -587,20 +587,32 @@ class DocumentView(QWebView):
            if self.manager is not None:
                self.manager.next_document()
        else:
            oopos = self.document.ypos
            #print '\nOriginal position:', oopos
            self.document.set_bottom_padding(0)
            opos = self.document.ypos
            #print 'After set padding=0:', self.document.ypos
            if opos < oopos:
                if self.manager is not None:
                    self.manager.next_document()
                return
            lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
            max_y = self.document.height - window_height # The maximum possible top y co-ord
            if max_y < lower_limit:
                #print 'Setting padding to:', lower_limit - max_y
                self.document.set_bottom_padding(lower_limit - max_y)
            max_y = self.document.height - window_height
            lower_limit = min(max_y, lower_limit)
            #print 'Scroll to:', lower_limit
            if lower_limit > opos:
                self.document.scroll_to(self.document.xpos, lower_limit)
            actually_scrolled = self.document.ypos - opos
            #print 'After scroll pos:', self.document.ypos
            self.find_next_blank_line(window_height - actually_scrolled)
            #print 'After blank line pos:', self.document.ypos
            if self.manager is not None:
                self.manager.scrolled(self.scroll_fraction)
            #print 'After all:', self.document.ypos
    def scroll_by(self, x=0, y=0, notify=True):
        old_pos = self.document.ypos
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -124,6 +124,11 @@ If you do need to reset your metadata due to problems caused by using both
 at the same time, then just delete the media.xml file on the Reader using
 your PC's file explorer and it will be recreated after disconnection.
 With recent reader iterations, SONY, in all its wisdom, has decided to try to force you to
 use their software. If you install it, it auto-launches whenever you connect the reader.
 If you don't want to uninstall it altogether, there are a couple of tricks you can use. The
 simplest is to simply rename the executable file that launches the library program. More detail
 `here <http://www.mobileread.com/forums/showthread.php?t=65809>`_.
 Can I use the collections feature of the SONY reader?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~