Recipe for Los Angeles Times (thanks to Darko Miletic)

2025-07-09 03:04:10 -04:00 · 2008-11-12 12:47:49 -08:00 · 2008-11-12 12:47:49 -08:00 · 69671e1386
commit 69671e1386
parent f56e8739cb
5 changed files with 69 additions and 4 deletions
--- a/src/calibre/ebooks/lrf/meta.py
+++ b/src/calibre/ebooks/lrf/meta.py
@@ -627,7 +627,6 @@ def set_metadata(stream, mi):
 def main(args=sys.argv):
    import os.path
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -124,7 +124,8 @@ class MobiReader(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
-    def __init__(self, filename_or_stream):
+    def __init__(self, filename_or_stream, verbose=False):
        self.verbose = verbose
        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
            stream.seek(0)
@@ -189,6 +190,8 @@ class MobiReader(object):
                '</style>\n',
                self.processed_html)
        if self.verbose:
            print 'Parsing HTML...'
        soup = BeautifulSoup(self.processed_html)
        self.cleanup_soup(soup)
        guide = soup.find('guide')
@@ -212,6 +215,8 @@ class MobiReader(object):
                open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
    def cleanup_html(self):
        if self.verbose:
            print 'Cleaning up HTML...'
        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
            self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
@@ -221,6 +226,8 @@ class MobiReader(object):
            self.processed_html = re.sub(r'(?i)</%s>'%t, r'</span>', self.processed_html)
    def cleanup_soup(self, soup):
        if self.verbose:
            print 'Replacing height, width and align attributes'
        for tag in soup.recursiveChildGenerator():
            if not isinstance(tag, Tag): continue
            styles = []
@@ -311,6 +318,8 @@ class MobiReader(object):
        return data[:len(data)-trail_size]
    def extract_text(self):
        if self.verbose:
            print 'Extracting text...'
        text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
        processed_records = list(range(0, self.book_header.records+1))
@@ -343,6 +352,8 @@ class MobiReader(object):
                                                      self.processed_html)
    def add_anchors(self):
        if self.verbose:
            print 'Adding anchors...'
        positions = set([])
        link_pattern = re.compile(r'<[^<>]+filepos=[\'"]{0,1}(\d+)[^<>]*>', re.IGNORECASE)
        for match in link_pattern.finditer(self.mobi_html):
@@ -370,6 +381,8 @@ class MobiReader(object):
                                               self.processed_html)
    def extract_images(self, processed_records, output_dir):
        if self.verbose:
            print 'Extracting images...'
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
@@ -438,7 +451,7 @@ def main(args=sys.argv):
        parser.print_help()
        return 1
-    mr = MobiReader(args[1])
+    mr = MobiReader(args[1], verbose=opts.verbose)
    opts.output_dir = os.path.abspath(opts.output_dir)
    mr.extract_content(opts.output_dir)
    if opts.verbose:
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -11,7 +11,7 @@ recipes = [
           'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht', 'lanacion',
           'discover_magazine', 'scientific_american', 'new_york_review_of_books',
           'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92', 
-           'politika', 'moscow_times'
+           'politika', 'moscow_times', 'latimes'
          ]
 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/latimes.py
+++ b/src/calibre/web/feeds/recipes/latimes.py
@@ -0,0 +1,28 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
 '''
 latimes.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class LATimes(BasicNewsRecipe):
    # Recipe settings for the Los Angeles Times (latimes.com).
    #
    # Identification strings for this recipe.
    title = u'The Los Angeles Times'
    __author__ = u'Darko Miletic'
    description = u'News from Los Angeles'
    # One feed, given as a (title, url) pair.
    feeds = [
        (u'News', u'http://feeds.latimes.com/latimes/news'),
    ]
    # Download options (oldest_article is presumably a number of days --
    # confirm against the BasicNewsRecipe documentation).
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    # Tag selectors: every entry matches a <div> by its id attribute.
    # 'socialnet' appears in both remove_tags_after and remove_tags.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'center'}},
    ]
    remove_tags_after = [
        {'name': 'div', 'attrs': {'id': 'socialnet'}},
    ]
    remove_tags = [
        {'name': 'div', 'attrs': {'id': 'wrapper_vid'}},
        {'name': 'div', 'attrs': {'id': 'article_related'}},
        {'name': 'div', 'attrs': {'id': 'socialnet'}},
    ]
--- a/src/calibre/web/feeds/recipes/moscow_times.py
+++ b/src/calibre/web/feeds/recipes/moscow_times.py
@@ -0,0 +1,25 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
 '''
 moscowtimes.ru
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class Moscowtimes(BasicNewsRecipe):
    # Recipe settings for The Moscow Times.
    #
    # Identification strings for this recipe.
    title = u'The Moscow Times'
    __author__ = 'Darko Miletic'
    description = 'News from Russia'
    # One feed, given as a (title, url) pair.
    feeds = [
        (u'The Moscow Times', u'http://www.themoscowtimes.com/rss.xml'),
    ]
    # Download options (oldest_article is presumably a number of days --
    # confirm against the BasicNewsRecipe documentation).
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    def print_version(self, url):
        # Return the article URL with the print flag appended.
        # NOTE(review): the '&' separator assumes `url` already carries a
        # query string -- confirm against the URLs the feed delivers.
        print_flag = '&print=Y'
        return url + print_flag