Added --debug-pipeline switch to EPUB/MOBI

2025-07-09 03:04:10 -04:00 · 2010-01-23 10:18:57 -07:00 · 2010-01-23 10:18:57 -07:00 · 958579d3dc
commit 958579d3dc
parent f525ec57d6 ac3041d044
10 changed files with 134 additions and 15 deletions
--- a/resources/images/news/greensboro_news_and_record.png
+++ b/resources/images/news/greensboro_news_and_record.png
--- a/resources/images/news/hotair.png
+++ b/resources/images/news/hotair.png
--- a/resources/recipes/greensboro_news_and_record.recipe
+++ b/resources/recipes/greensboro_news_and_record.recipe
@ -0,0 +1,54 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
+'''
+www.news-record.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NewsandRecord(BasicNewsRecipe):
+    # Recipe for the Greensboro News & Record (www.news-record.com).
+
+    # Publication metadata
+    title = u'Greensboro News & Record'
+    description = "News from Greensboro, North Carolina"
+    __author__ = 'Walt Anthony'
+    publisher = 'News & Record and Landmark Media Enterprises, LLC'
+    category = 'news, USA'
+    language = 'en'
+
+    # Download settings
+    oldest_article = 3  # days
+    max_articles_per_feed = 25
+    summary_length = 150
+    encoding = 'utf-8'
+    remove_javascript = True
+    no_stylesheets = True
+
+    # Conversion options assembled from the metadata fields above
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language,
+    }
+
+    # Tag filters interpreted by BasicNewsRecipe: the headline <h3> and the
+    # content body <div> bound the article, the listed tags are stripped
+    remove_tags_before = {'name': 'h3', 'attrs': {'class': 'nrcTxt_headline'}}
+    remove_tags_after = {'name': 'div', 'attrs': {'id': 'nrcBlk_ContentBody'}}
+    remove_tags = [
+        {'name': 'iframe'},
+        {'name': ['notags', 'embed', 'object', 'link', 'img']},
+    ]
+
+    # (section title, feed URL) pairs
+    feeds = [
+        ('News', 'http://www.news-record.com/news/archive/feed'),
+        ('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
+        ('Education', 'http://www.news-record.com/news/education/feed'),
+        ('Government', 'http://www.news-record.com/news/government/feed'),
+        ('College Sports', 'http://www.news-record.com/sports/college/feed'),
+        ('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
+        ('Life', 'http://www.news-record.com/life/top/feed'),
+        ('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
+        ('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
+        ('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed'),
+    ]
+
--- a/resources/recipes/hotair.recipe
+++ b/resources/recipes/hotair.recipe
@ -0,0 +1,41 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
+'''
+www.hotair.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class hotair(BasicNewsRecipe):
+    # Recipe for Hot Air (www.hotair.com).
+
+    # Publication metadata
+    title = u'Hot Air'
+    __author__ = 'Walt Anthony'
+    description = "The world's first, full-service conservative Internet broadcast network"
+    publisher = 'Hot Air'
+    category = 'news, politics, USA'
+    language = 'en'
+
+    # Download settings
+    oldest_article = 3
+    max_articles_per_feed = 100
+    summary_length = 150
+    encoding = 'utf-8'
+    use_embedded_content = False
+    remove_javascript = True
+
+    # Conversion options assembled from the metadata fields above
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language,
+    }
+
+    # Tag filters interpreted by BasicNewsRecipe
+    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'page-post'}}]
+    remove_tags = [
+        {'name': ['iframe', 'small', 'embed', 'object', 'link', 'script', 'form']},
+    ]
+
+    # (section title, feed URL) pairs
+    feeds = [
+        ('Hot Air', 'http://feeds.feedburner.com/hotair/main'),
+        ('The Greenroom', 'http://feeds2.feedburner.com/hotair/greenroom'),
+    ]
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -146,12 +146,14 @@ class Region(object):
        self.columns = []
        self.top = self.bottom = self.left = self.right = self.width = self.height = 0

-    def add_columns(self, columns):
+    def add(self, columns):
+        '''
+        Add a row of columns to this region.
+
+        An empty region adopts the columns, ordered by their left edge.
+        Otherwise the elements of each incoming column are added to the
+        existing column at the same index.
+        '''
        if not self.columns:
            for x in sorted(columns, cmp=lambda x,y: cmp(x.left, y.left)):
                self.columns.append(x)
        else:
-           pass
+            # Positional merge: incoming column i feeds existing column i
+            for idx, column in enumerate(columns):
+                for elem in column:
+                    self.columns[idx].add(elem)

    def contains(self, columns):
        if not self.columns:
@ -168,6 +170,11 @@ class Region(object):
                return False
        return True

+    @property
+    def is_empty(self):
+        'True if no columns have been added to this region'
+        # Region stores its content in self.columns (filled by add())
+        return not self.columns
+
+
 class Page(object):

    # Fraction of a character width that two strings have to be apart,
@ -242,19 +249,25 @@ class Page(object):
                self.texts.remove(match)

    def first_pass(self):
+        '''
+        Sort page into regions and columns.
+
+        Rebuilds self.regions. Elements are taken row by row and sorted
+        into columns; each row is added to the region being built as long
+        as that region contains its columns, otherwise the region is
+        stored and a new one is started. A page without elements leaves
+        self.regions empty.
+        '''
        self.regions = []
        if not self.elements:
            return
        for i, x in enumerate(self.elements):
            x.idx = i
-        self.current_region = None
+        current_region = Region()
        processed = set([])
        for x in self.elements:
            if x in processed: continue
            elems = set(self.find_elements_in_row_of(x))
            columns = self.sort_into_columns(x, elems)
            processed.update(elems)
-            columns
+            if not current_region.contains(columns):
+                # Row does not fit the region being built: store that
+                # region and start a fresh one for this row
+                self.regions.append(current_region)
+                current_region = Region()
+            current_region.add(columns)
+        # Store the trailing region if it received any columns
+        if current_region.columns:
+            self.regions.append(current_region)

    def sort_into_columns(self, elem, neighbors):
        columns = [Column()]
--- a/src/calibre/gui2/catalog/catalog_epub_mobi.py
+++ b/src/calibre/gui2/catalog/catalog_epub_mobi.py
@ -17,7 +17,7 @@ class PluginWidget(QWidget,Ui_Form):
    TITLE = _('E-book Options')
    HELP  = _('Options specific to')+' EPUB/MOBI '+_('output')
    OPTION_FIELDS = [('exclude_genre','\[[\w ]*\]'),
-                     ('exclude_tags','~,Catalog'),
+                     ('exclude_tags','~,'+_('Catalog')),
                     ('read_tag','+'),
                     ('note_tag','*')]

--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@ -932,7 +932,7 @@ class DeviceGUI(object):
            if isinstance(job.exception, FreeSpaceError):
                where = 'in main memory.' if 'memory' in str(job.exception) \
                        else 'on the storage card.'
-                titles = '\n'.join(['<li>'+mi['title']+'</li>' \
+                titles = '\n'.join(['<li>'+mi.title+'</li>' \
                                    for mi in metadata])
                d = error_dialog(self, _('No space on device'),
                                 _('<p>Cannot upload books to device there '
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -241,13 +241,22 @@ class EPUB_MOBI(CatalogPlugin):
                          help = _('Title of generated catalog used as title in metadata.\n'
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
+                   Option('--debug-pipeline',
+                           default=None,
+                           dest='debug_pipeline',                
+                           help=_('Save the output from different stages of the conversion '
+                           'pipeline to the specified '
+                           'directory. Useful if you are unsure at which stage '
+                           'of the conversion process a bug is occurring.\n'
+                           'Default: None\n'
+                           'Applies to: ePub, MOBI output formats')),    
                   Option('--exclude-genre',
                          default='\[[\w ]*\]',
                          dest='exclude_genre',
                          help=_("Regex describing tags to exclude as genres.\n" "Default: '%default' excludes bracketed tags, e.g. '[<tag>]'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--exclude-tags',
-                          default='~,Catalog',
+                          default=('~,'+_('Catalog')),
                          dest='exclude_tags',
                          help=_("Comma-separated list of tag words indicating book should be excluded from output.  Case-insensitive.\n"
                          "--exclude-tags=skip will match 'skip this book' and 'Skip will like this'.\n"
@ -2497,6 +2506,7 @@ class EPUB_MOBI(CatalogPlugin):
                    pw.MagickThumbnailImage(thumb, 75, 100)
                    pw.MagickWriteImage(thumb, os.path.join(image_dir, thumb_file))
                    pw.DestroyMagickWand(thumb)
+                    pw.DestroyMagickWand(img)
                except IOError:
                    print "generate_thumbnail() IOError with %s" % title['title']
                except RuntimeError:
@ -2541,11 +2551,8 @@ class EPUB_MOBI(CatalogPlugin):
            return "%.2f%% %s" % (self.progressInt, self.progressString)

    def run(self, path_to_output, opts, db, notification=DummyReporter()):
-        import gc
        from calibre.utils.logging import Log

-        gc.set_debug(gc.DEBUG_LEAK)
-
        log = Log()
        opts.fmt = self.fmt = path_to_output.rpartition('.')[2]
        self.opts = opts
@ -2562,13 +2569,18 @@ class EPUB_MOBI(CatalogPlugin):
            log("%s:run" % self.name)
            log(" path_to_output: %s" % path_to_output)
            log(" Output format: %s" % self.fmt)
-            log(" Book count: %d" % len(opts_dict['ids']))
+            if opts_dict['ids']:
+                log(" Book count: %d" % len(opts_dict['ids']))
            # Display opts
            keys = opts_dict.keys()
            keys.sort()
            log(" opts:")
            for key in keys:
-                if key == 'ids': continue
+                if key == 'ids':
+                    if opts_dict[key]:
+                        continue
+                    else:
+                        log("  %s: (all)" % key)
                log("  %s: %s" % (key, opts_dict[key]))

        # Launch the Catalog builder
@ -2593,5 +2605,3 @@ class EPUB_MOBI(CatalogPlugin):
        plumber.merge_ui_recommendations(recommendations)

        plumber.run()
-
-        print gc.garbage
--- a/src/calibre/library/save_to_disk.py
+++ b/src/calibre/library/save_to_disk.py
@ -112,6 +112,7 @@ def get_components(template, mi, id, timefmt='%b %Y', length=250,
        format_args['title'] = mi.title
    if mi.authors:
        format_args['authors'] = mi.format_authors()
+        format_args['author'] = format_args['authors']
    if mi.author_sort:
        format_args['author_sort'] = mi.author_sort
    if mi.tags: