diff --git a/src/calibre/ebooks/lrf/meta.py b/src/calibre/ebooks/lrf/meta.py index cac7ea9cc2..6b5e296d14 100644 --- a/src/calibre/ebooks/lrf/meta.py +++ b/src/calibre/ebooks/lrf/meta.py @@ -627,7 +627,6 @@ def set_metadata(stream, mi): def main(args=sys.argv): - import os.path parser = option_parser() options, args = parser.parse_args(args) if len(args) != 2: diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 6cc283be95..3d4ec4a72d 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -124,7 +124,8 @@ class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) - def __init__(self, filename_or_stream): + def __init__(self, filename_or_stream, verbose=False): + self.verbose = verbose if hasattr(filename_or_stream, 'read'): stream = filename_or_stream stream.seek(0) @@ -189,6 +190,8 @@ class MobiReader(object): '\n', self.processed_html) + if self.verbose: + print 'Parsing HTML...' soup = BeautifulSoup(self.processed_html) self.cleanup_soup(soup) guide = soup.find('guide') @@ -212,6 +215,8 @@ class MobiReader(object): open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) def cleanup_html(self): + if self.verbose: + print 'Cleaning up HTML...' self.processed_html = re.sub(r'
', '', self.processed_html) if self.book_header.ancient and '')+'' @@ -221,6 +226,8 @@ class MobiReader(object): self.processed_html = re.sub(r'(?i)'%t, r'', self.processed_html) def cleanup_soup(self, soup): + if self.verbose: + print 'Replacing height, width and align attributes' for tag in soup.recursiveChildGenerator(): if not isinstance(tag, Tag): continue styles = [] @@ -311,6 +318,8 @@ class MobiReader(object): return data[:len(data)-trail_size] def extract_text(self): + if self.verbose: + print 'Extracting text...' text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) @@ -343,6 +352,8 @@ class MobiReader(object): self.processed_html) def add_anchors(self): + if self.verbose: + print 'Adding anchors...' positions = set([]) link_pattern = re.compile(r'<[^<>]+filepos=[\'"]{0,1}(\d+)[^<>]*>', re.IGNORECASE) for match in link_pattern.finditer(self.mobi_html): @@ -370,6 +381,8 @@ class MobiReader(object): self.processed_html) def extract_images(self, processed_records, output_dir): + if self.verbose: + print 'Extracting images...' output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -438,7 +451,7 @@ def main(args=sys.argv): parser.print_help() return 1 - mr = MobiReader(args[1]) + mr = MobiReader(args[1], verbose=opts.verbose) opts.output_dir = os.path.abspath(opts.output_dir) mr.extract_content(opts.output_dir) if opts.verbose: diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index c50f93d7ef..1462c688c1 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -11,7 +11,7 @@ recipes = [ 'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht', 'lanacion', 'discover_magazine', 'scientific_american', 'new_york_review_of_books', 'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92', - 'politika', 'moscow_times' + 'politika', 'moscow_times', 'latimes' ] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/latimes.py b/src/calibre/web/feeds/recipes/latimes.py new file mode 100644 index 0000000000..71bbc14068 --- /dev/null +++ b/src/calibre/web/feeds/recipes/latimes.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2008, Darko Miletic ' +''' +latimes.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LATimes(BasicNewsRecipe): + title = u'The Los Angeles Times' + __author__ = u'Darko Miletic' + description = u'News from Los Angeles' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + + keep_only_tags = [ dict(name='div', attrs={'id':'center' }) ] + remove_tags_after = [ dict(name='div', attrs={'id':'socialnet'}) ] + remove_tags = [ + dict(name='div' , attrs={'id':'wrapper_vid' }) + ,dict(name='div' , attrs={'id':'article_related'}) + ,dict(name='div' , attrs={'id':'socialnet' }) + ] + + feeds = [(u'News', u'http://feeds.latimes.com/latimes/news')] \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/moscow_times.py b/src/calibre/web/feeds/recipes/moscow_times.py new file mode 100644 index 0000000000..6c4d249fad --- /dev/null +++ b/src/calibre/web/feeds/recipes/moscow_times.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2008, Darko Miletic ' +''' +moscowtimes.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Moscowtimes(BasicNewsRecipe): + title = u'The Moscow Times' + __author__ = 'Darko Miletic' + description = 'News from Russia' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + + feeds = [ + (u'The Moscow Times' , u'http://www.themoscowtimes.com/rss.xml' ) + ] + + def print_version(self, url): + return url + '&print=Y' \ No newline at end of file