Sync to trunk

2025-08-30 23:00:21 -04:00 · 2009-01-07 07:06:16 -05:00 · 2009-01-07 07:06:16 -05:00 · 86efeee9e5
commit 86efeee9e5
parent 26d9e18669 42e8e8e5d2
8 changed files with 165 additions and 67 deletions
--- a/src/calibre/ebooks/lrf/comic/convert_from.py
+++ b/src/calibre/ebooks/lrf/comic/convert_from.py
@ -10,6 +10,14 @@ Based on ideas from comiclrf created by FangornUK.
 import os, sys, shutil, traceback, textwrap
 from uuid import uuid4

+# reportlab is optional here; create_pdf checks _reportlab before using canvas.
+try:
+    from reportlab.pdfgen import canvas
+    _reportlab = True
+except ImportError:
+    _reportlab = False
+
+
 from calibre import extract, terminal_controller, __appname__, __version__
 from calibre.utils.config import Config, StringConfig
 from calibre.ptempfile import PersistentTemporaryDirectory
@ -43,7 +51,7 @@ PROFILES = {
            # Name : (width, height) in pixels
            'prs500':(584, 754),
            # The SONY's LRF renderer (on the PRS500) only uses the first 800x600 block of the image 
-            #'prs500-landscape': (784, 1200-92)
+            'prs500-landscape': (784, 1012)
            }

 def extract_comic(path_to_comic_file):
@ -279,7 +287,7 @@ def process_pages(pages, opts, update):
        failures += failures_
    return ans, failures, tdir
    
-def config(defaults=None):
+def config(defaults=None,output_format='lrf'):
    desc = _('Options to control the conversion of comics (CBR, CBZ) files into ebooks')
    if defaults is None:
        c = Config('comic', desc)
@ -316,10 +324,13 @@ def config(defaults=None):
              help=_('Be verbose, useful for debugging. Can be specified multiple times for greater verbosity.'))
    c.add_opt('no_progress_bar', ['--no-progress-bar'], default=False,
                      help=_("Don't show progress bar."))
+    if output_format == 'pdf':
+        c.add_opt('no_process',['--no_process'], default=False,
+    		      help=_("Apply no processing to the image"))
    return c

-def option_parser():
-    c = config()
+def option_parser(output_format='lrf'):
+    c = config(output_format=output_format)
    return c.option_parser(usage=_('''\
 %prog [options] comic.cb[z|r]

@ -383,6 +394,24 @@ def create_lrf(pages, profile, opts, thumbnail=None):
    print _('Output written to'), opts.output
    

+def create_pdf(pages, profile, opts, thumbnail=None):
+    '''Draw each entry of `pages` on its own page of a PDF saved to opts.output.
+    `thumbnail` is accepted but unused. Raises RuntimeError without reportlab.'''
+    if not _reportlab:
+        raise RuntimeError('Failed to load reportlab')
+    width, height = PROFILES[profile]
+    # NOTE(review): the page is 15 units taller than the drawn image; reason not visible here
+    pdf = canvas.Canvas(filename=opts.output, pagesize=(width,height+15))
+    pdf.setAuthor(opts.author)
+    pdf.setTitle(opts.title)
+
+    for page in pages:
+        pdf.drawImage(page, x=0,y=0,width=width, height=height)
+        pdf.showPage()
+    # Write the document to disk
+    pdf.save()
+
+    
 def do_convert(path_to_file, opts, notification=lambda m, p: p, output_format='lrf'):
    path_to_file = run_plugins_on_preprocess(path_to_file)
    source = path_to_file
@ -393,29 +422,33 @@ def do_convert(path_to_file, opts, notification=lambda m, p: p, output_format='l
        opts.output = os.path.abspath(os.path.splitext(os.path.basename(source))[0]+'.'+output_format)
    tdir  = extract_comic(source)
    pages = find_pages(tdir, sort_on_mtime=opts.no_sort, verbose=opts.verbose)
+    thumbnail = None
    if not pages:
        raise ValueError('Could not find any pages in the comic: %s'%source)
-    pages, failures, tdir2 = process_pages(pages, opts, notification)
-    if not pages:
-        raise ValueError('Could not find any valid pages in the comic: %s'%source)
-    if failures:
-        print 'Could not process the following pages (run with --verbose to see why):'
-        for f in failures:
-            print '\t', f
-    thumbnail = os.path.join(tdir2, 'thumbnail.png')
-    if not os.access(thumbnail, os.R_OK):
-        thumbnail = None
-    
+    if not getattr(opts, 'no_process', False):  # option is only defined by config() for pdf
+        pages, failures, tdir2 = process_pages(pages, opts, notification)
+        if not pages:
+            raise ValueError('Could not find any valid pages in the comic: %s'%source)
+        if failures:
+            print 'Could not process the following pages (run with --verbose to see why):'
+            for f in failures:
+                print '\t', f
+        thumbnail = os.path.join(tdir2, 'thumbnail.png')
+        if not os.access(thumbnail, os.R_OK):
+            thumbnail = None
    if output_format == 'lrf':
        create_lrf(pages, opts.profile, opts, thumbnail=thumbnail)
-    else:
+    if output_format == 'epub':
        create_epub(pages, opts.profile, opts, thumbnail=thumbnail)
+    if output_format == 'pdf':
+        create_pdf(pages, opts.profile, opts, thumbnail=thumbnail)
    shutil.rmtree(tdir)
-    shutil.rmtree(tdir2)
+    if not getattr(opts, 'no_process', False):  # tdir2 exists only when pages were processed
+        shutil.rmtree(tdir2)


 def main(args=sys.argv, notification=None, output_format='lrf'):
-    parser = option_parser()
+    parser = option_parser(output_format=output_format)
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
@ -429,7 +462,6 @@ def main(args=sys.argv, notification=None, output_format='lrf'):
    
    source = os.path.abspath(args[1])
    do_convert(source, opts, notification, output_format=output_format)
-    
    return 0

 if __name__ == '__main__':
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -12,15 +12,17 @@ try:
 except ImportError:
    import Image as PILImage

+from lxml import html, etree
+
 from calibre import __appname__, entity_to_unicode
 from calibre.ebooks import DRMError
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.chardet import ENCODING_PATS
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.huffcdic import HuffReader
 from calibre.ebooks.mobi.palmdoc import decompress_doc
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
 from calibre.ebooks.metadata import MetaInformation
-from calibre.ebooks.metadata.opf import OPFCreator
+from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre import sanitize_file_name

@ -176,6 +178,8 @@ class MobiReader(object):
        processed_records = self.extract_text()
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
+        for pat in ENCODING_PATS:
+            self.processed_html = pat.sub('', self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()
@ -185,7 +189,6 @@ class MobiReader(object):
        self.processed_html = \
            re.compile('<head>', re.IGNORECASE).sub(
                '\n<head>\n'
-                '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n'
                '<style type="text/css">\n'
                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
                'p { margin: 0em; text-align: justify; }\n'
@ -196,23 +199,33 @@ class MobiReader(object):
        
        if self.verbose:
            print 'Parsing HTML...'
-        soup = BeautifulSoup(self.processed_html)
-        self.cleanup_soup(soup)
-        guide = soup.find('guide')
-        for elem in soup.findAll(['metadata', 'guide']):
-            elem.extract()
+        root = html.fromstring(self.processed_html)
+        self.upshift_markup(root)
+        guides = root.xpath('//guide')
+        guide = guides[0] if guides else None
+        for elem in guides + root.xpath('//metadata'):
+            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 
                                sanitize_file_name(self.name)+'.html')
        try:
-            for ref in guide.findAll('reference', href=True):
-                ref['href'] = os.path.basename(htmlfile)+ref['href']
+            for ref in guide.xpath('descendant::reference'):
+                if ref.attrib.has_key('href'):
+                    ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
        except AttributeError:
            pass
+        if self.verbose:
+            print 'Serializing...'
        with open(htmlfile, 'wb') as f:
-            f.write(unicode(soup).encode('utf8'))
+            raw = html.tostring(root, encoding='utf-8', method='xml', 
+                         include_meta_content_type=True, pretty_print=True)
+            raw = raw.replace('<head>', 
+            '<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n')
+            f.write(raw)
        self.htmlfile = htmlfile
        
        if self.book_header.exth is not None:
+            if self.verbose:
+                print 'Creating OPF...'
            ncx = cStringIO.StringIO()
            opf = self.create_opf(htmlfile, guide)
            opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx)
@ -231,9 +244,9 @@ class MobiReader(object):
            self.processed_html = re.sub(r'(?i)<%s>'%t, r'<span class="%s">'%c, self.processed_html)
            self.processed_html = re.sub(r'(?i)</%s>'%t, r'</span>', self.processed_html)
        
-    def cleanup_soup(self, soup):
+    def upshift_markup(self, root):
        if self.verbose:
-            print 'Replacing height, width and align attributes'
+            print 'Converting style information to CSS...'
        size_map = {
                    'xx-small' : '0.5',
                    'x-small'  : '1',
@ -243,41 +256,36 @@ class MobiReader(object):
                    'x-large'  : '5',
                    'xx-large' : '6',
                    }
-        for tag in soup.recursiveChildGenerator():
-            if not isinstance(tag, Tag): continue
-            styles = []
-            try:
-                styles.append(tag['style'])
-            except KeyError:
-                pass
-            try:
-                styles.append('margin-top: %s' % tag['height'])
-                del tag['height']
-            except KeyError:
-                pass
-            try:
-                styles.append('text-indent: %s' % tag['width'])
-                if tag['width'].startswith('-'):
-                    styles.append('margin-left: %s'%(tag['width'][1:]))
-                del tag['width']
-            except KeyError:
-                pass
-            try:
-                styles.append('text-align: %s' % tag['align'])
-                del tag['align']
-            except KeyError:
-                pass
+        for tag in root.iter(etree.Element):
+            styles, attrib = [], tag.attrib
+            if attrib.has_key('style'):
+                style = attrib.pop('style').strip()
+                if style:
+                    styles.append(style)
+            if attrib.has_key('height'):
+                height = attrib.pop('height').strip()
+                if height:
+                    styles.append('margin-top: %s' % height)
+            if attrib.has_key('width'):
+                width = attrib.pop('width').strip()
+                if width:
+                    styles.append('text-indent: %s' % width)
+                    if width.startswith('-'):
+                        styles.append('margin-left: %s'%(width[1:]))
+            if attrib.has_key('align'):
+                align = attrib.pop('align').strip()
+                if align:
+                    styles.append('text-align: %s' % align)
            if styles:
-                tag['style'] = '; '.join(styles)
+                attrib['style'] = '; '.join(styles)
                
-            if tag.name.lower() == 'font':
-                sz = tag.get('size', '')
+            if tag.tag.lower() == 'font':
+                sz = tag.get('size', '').lower()
                try:
                    float(sz)
                except ValueError:
-                    sz = sz.lower()
                    if sz in size_map.keys():
-                        tag['size'] = size_map[sz]
+                        attrib['size'] = size_map[sz]
    
    def create_opf(self, htmlfile, guide=None):
        mi = self.book_header.exth.mi
@ -292,7 +300,7 @@ class MobiReader(object):
        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
-        if guide:
+        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
@ -303,16 +311,16 @@ class MobiReader(object):
            ent_pat = re.compile(r'&(\S+?);')
            if index > -1:
                raw = '<html><body>'+self.processed_html[index:]
-                soup = BeautifulSoup(raw)
+                root = html.fromstring(raw)
                tocobj = TOC()
-                for a in soup.findAll('a', href=True):
+                for a in root.xpath('//a[@href]'):
                    try:
-                        text = u''.join(a.findAll(text=True)).strip()
+                        text = u' '.join([t.strip() for t in a.xpath('descendant::text()')])
                    except:
                        text = ''
                    text = ent_pat.sub(entity_to_unicode, text)
-                    if a['href'].startswith('#'):
-                        tocobj.add_item(toc.partition('#')[0], a['href'][1:], text)
+                    if a.get('href', '').startswith('#'):
+                        tocobj.add_item(toc.partition('#')[0], a.attrib['href'][1:], text)
            if tocobj is not None:
                opf.set_toc(tocobj)
        
--- a/src/calibre/ebooks/pdf/init.py
+++ b/src/calibre/ebooks/pdf/init.py
@ -0,0 +1,9 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Used for pdf output for comic2pdf
+'''
+
--- a/src/calibre/ebooks/pdf/from_comic.py
+++ b/src/calibre/ebooks/pdf/from_comic.py
@ -0,0 +1,21 @@
+from __future__ import with_statement
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'Convert a comic in CBR/CBZ format to pdf'
+
+import sys
+from functools import partial
+from calibre.ebooks.lrf.comic.convert_from import do_convert, option_parser, config, main as _main
+
+convert = partial(do_convert, output_format='pdf')
+main    = partial(_main, output_format='pdf')
+
+if __name__ == '__main__':
+    sys.exit(main())
+
+if False:  # never executed; presumably references the imports so checkers do not flag them as unused
+    option_parser
+    config
+    
--- a/src/calibre/gui2/images/news/joelonsoftware.png
+++ b/src/calibre/gui2/images/news/joelonsoftware.png
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@ -59,6 +59,7 @@ entry_points = {
                             'oeb2lit   = calibre.ebooks.lit.writer:main',
                             'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main',
                             'comic2epub = calibre.ebooks.epub.from_comic:main',
+			     'comic2pdf  = calibre.ebooks.pdf.from_comic:main',
                             'calibre-debug      = calibre.debug:main',
                             'calibredb          = calibre.library.cli:main',
                             'calibre-fontconfig = calibre.utils.fontconfig:main',
@ -228,6 +229,7 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('lit2oeb', lit2oeb, ['lit']))
        f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
        f.write(opts_and_exts('comic2epub', comic2epub, ['cbz', 'cbr']))
+        f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))  # NOTE(review): passes the comic2epub parser; pdf-only options such as --no_process may be missing - confirm
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
        f.write(opts_and_words('feeds2lrf', feeds2epub, feed_titles))
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -21,6 +21,7 @@ recipe_modules = ['recipe_' + r for r in (
           'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',
           'time_magazine', 'endgadget', 'fudzilla', 'nspm_int', 'nspm', 'pescanik',
           'spiegel_int', 'themarketticker', 'tomshardware', 'xkcd', 'ftd', 'zdnet',
+           'joelonsoftware',
          )]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_joelonsoftware.py
+++ b/src/calibre/web/feeds/recipes/recipe_joelonsoftware.py
@ -0,0 +1,25 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+'''
+joelonsoftware.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Joelonsoftware(BasicNewsRecipe):
+    # Recipe for the 'Joel on Software' blog; its single feed is the site's rss.xml (see feeds).
+    title       = 'Joel on Software'
+    __author__  = 'Darko Miletic'
+    description = 'Painless Software Management'
+    no_stylesheets = True
+    use_embedded_content  = True
+    
+    cover_url = 'http://www.joelonsoftware.com/RssJoelOnSoftware.jpg'
+    
+    html2lrf_options = [  '--comment'       , description
+                        , '--category'      , 'blog,software,news'
+                        , '--author'        , 'Joel Spolsky'
+                       ]
+    
+    feeds = [(u'Articles', u'http://www.joelonsoftware.com/rss.xml')]