Merge from trunk

2025-11-16 11:33:04 -05:00 · 2010-12-19 09:56:21 +00:00 · 2010-12-19 09:56:21 +00:00 · 8b56f2d8c7
commit 8b56f2d8c7
parent 19c4afd222 c9fe094afa
11 changed files with 172 additions and 98 deletions
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -11,7 +11,7 @@
    - title: "Page turn animations in the e-book viewer"
      type: major
      description: >
-        "Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. Setting it to o disables the animation completely."
+        "Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. Setting it to 0 disables the animation completely."

    - title: "Conversion pipeline: Add an option to set the minimum line height of all elements as a percentage of the computed font size. By default, calibre now sets the line height to 120% of the computed font size."

--- a/resources/recipes/johm.recipe
+++ b/resources/recipes/johm.recipe
@ -1,78 +1,72 @@
-# -*- coding: utf-8 -*-
-
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class JournalofHospitalMedicine(BasicNewsRecipe):

    title       = 'Journal of Hospital Medicine'
-    __author__  = 'Krittika Goyal'
+    __author__  = 'Kovid Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = True
    language = 'en'

    no_stylesheets = True
+    keep_only_tags = [dict(id=['articleTitle', 'articleMeta', 'fulltext'])]
+    remove_tags = [dict(attrs={'class':'licensedContent'})]


   # TO LOGIN
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www3.interscience.wiley.com/cgi-bin/home')
-        br.select_form(name='siteLogin')
-        br['LoginName'] = self.username
-        br['Password'] = self.password
+        br.select_form(nr=0)
+        br['j_username'] = self.username
+        br['j_password'] = self.password
        response = br.submit()
        raw = response.read()
-        if 'userName = ""' in raw:
+        if '<h2>LOGGED IN</h2>' not in raw:
            raise Exception('Login failed. Check your username and password')
        return br

    #TO GET ARTICLE TOC
    def johm_get_index(self):
-        return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')
+        return self.index_to_soup('http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1553-5606/currentissue')

    # To parse article TOC
    def parse_index(self):
-        parse_soup = self.johm_get_index()
-
-        div = parse_soup.find(id='contentCell')
-
-        current_section = None
-        current_articles = []
+        soup = self.johm_get_index()
+        toc = soup.find(id='issueTocGroups')
        feeds = []
-        for x in div.findAll(True):
-            if x.name == 'h4':
-                # Section heading found
-                if current_articles and current_section:
-                    feeds.append((current_section, current_articles))
-                current_section = self.tag_to_string(x)
-                current_articles = []
-                self.log('\tFound section:', current_section)
-            if current_section is not None and x.name == 'strong':
-                title = self.tag_to_string(x)
-                p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
-                if p is None:
-                    continue
-                url = p.get('href', False)
-                if not url or not title:
+        for group in toc.findAll('li', id=re.compile(r'group\d+')):
+            gtitle = group.find(attrs={'class':'subSectionHeading'})
+            if gtitle is None:
+                continue
+            gtitle = self.tag_to_string(gtitle)
+            arts = group.find(attrs={'class':'articles'})
+            if arts is None:
+                continue
+            self.log('Found section:', gtitle)
+            articles = []
+            for art in arts.findAll(attrs={'class':lambda x: x and 'tocArticle'
+                in x}):
+                a = art.find('a', href=True)
+                if a is None:
                    continue
+                url = a.get('href')
                if url.startswith('/'):
-                        url = 'http://www3.interscience.wiley.com'+url
-                url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
-                self.log('\t\tFound article:', title)
-                self.log('\t\t\t', url)
-                #if url.startswith('/'):
-                    #url = 'http://online.wsj.com'+url
-                current_articles.append({'title': title, 'url':url,
-                    'description':'', 'date':''})
-
-        if current_articles and current_section:
-            feeds.append((current_section, current_articles))
+                    url = 'http://onlinelibrary.wiley.com' + url
+                url = url.replace('/abstract', '/full')
+                title = self.tag_to_string(a)
+                a.extract()
+                pm = art.find(attrs={'class':'productMenu'})
+                if pm is not None:
+                    pm.extract()
+                desc = self.tag_to_string(art)
+                self.log('\tFound article:', title, 'at', url)
+                articles.append({'title':title, 'url':url, 'description':desc,
+                    'date':''})
+            if articles:
+                feeds.append((gtitle, articles))

        return feeds

-    def preprocess_html(self, soup):
-        for img in soup.findAll('img', src=True):
-            img['src'] = img['src'].replace('tfig', 'nfig')
-        return soup
-
--- a/resources/recipes/nejm.recipe
+++ b/resources/recipes/nejm.recipe
@ -4,7 +4,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class NYTimes(BasicNewsRecipe):

    title       = 'New England Journal of Medicine'
-    __author__  = 'Krittika Goyal'
+    __author__  = 'Kovid Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = True
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@ -36,6 +36,16 @@ Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTM

 Install pywin32 and edit win32com\__init__.py setting _frozen = True and
 __gen_path__ to a temp dir (otherwise it tries to set it to a dir in the install tree which leads to permission errors)
+Note that you should use::
+
+    import tempfile
+    __gen_path__ = os.path.join(
+                            tempfile.gettempdir(), "gen_py",
+                            "%d.%d" % (sys.version_info[0], sys.version_info[1]))
+
+Use gettempdir instead of the win32 api method as gettempdir returns a temp dir that is guaranteed to actually work.
+
+
 Also edit win32com\client\gencache.py and change the except IOError on line 57 to catch all exceptions.

 SQLite
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -28,7 +28,7 @@ class ANDROID(USBMS):

            # Motorola
            0x22b8 : { 0x41d9 : [0x216], 0x2d67 : [0x100], 0x41db : [0x216],
-                0x4285 : [0x216]},
+                0x4285 : [0x216], 0x42a3 : [0x216] },

            # Sony Ericsson
            0xfce : { 0xd12e : [0x0100]},
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@ -62,9 +62,9 @@ class SWEEX(USBMS):
    # Ordered list of supported formats
    FORMATS     = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']

-    VENDOR_ID   = [0x0525]
-    PRODUCT_ID  = [0xa4a5]
-    BCD         = [0x0319]
+    VENDOR_ID   = [0x0525, 0x177f]
+    PRODUCT_ID  = [0xa4a5, 0x300]
+    BCD         = [0x0319, 0x110]

    VENDOR_NAME = 'SWEEX'
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@ -27,13 +27,10 @@ class FB2MLizer(object):
    '''
    Todo: * Include more FB2 specific tags in the conversion.
          * Handle a tags.
-          * Figure out some way to turn oeb_book.toc items into <section><title>
-            <p> to allow for readers to generate toc from the document.
    '''

    def __init__(self, log):
        self.log = log
-        self.image_hrefs = {}
        self.reset_state()

    def reset_state(self):
@ -43,17 +40,25 @@ class FB2MLizer(object):
        # in different directories. FB2 images are all in a flat layout so we rename all images
        # into a sequential numbering system to ensure there are no collisions between image names.
        self.image_hrefs = {}
+        # Mapping of TOC item hrefs to either 'page' or a dict of {element id: section level}
+        self.toc = {}
+        # Used to see whether a new <section> needs to be opened
+        self.section_level = 0

    def extract_content(self, oeb_book, opts):
        self.log.info('Converting XHTML to FB2 markup...')
        self.oeb_book = oeb_book
        self.opts = opts
+        self.reset_state()
+        
+        # Used for adding <section>s and <title>s to allow readers
+        # to generate toc from the document.
+        if self.opts.sectionize == 'toc':
+            self.create_flat_toc(self.oeb_book.toc, 1)

        return self.fb2mlize_spine()

    def fb2mlize_spine(self):
-        self.reset_state()
-
        output = [self.fb2_header()]
        output.append(self.get_text())
        output.append(self.fb2mlize_images())
@ -66,13 +71,19 @@ class FB2MLizer(object):
            return u'<?xml version="1.0" encoding="UTF-8"?>' + output

    def clean_text(self, text):
-        text = re.sub(r'(?miu)<section>\s*</section>', '', text)
-        text = re.sub(r'(?miu)\s+</section>', '</section>', text)
-        text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
-
        text = re.sub(r'(?miu)<p>\s*</p>', '', text)
-        text = re.sub(r'(?miu)\s+</p>', '</p>', text)
-        text = re.sub(r'(?miu)</p><p>', '</p>\n\n<p>', text)
+        text = re.sub(r'(?miu)\s*</p>', '</p>', text)
+        text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
+        
+        text = re.sub(r'(?miu)<title>\s*</title>', '', text)
+        text = re.sub(r'(?miu)\s+</title>', '</title>', text)
+        
+        text = re.sub(r'(?miu)<section>\s*</section>', '', text)
+        text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
+        text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
+        text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
+        text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
+        text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
        
        if self.opts.insert_blank_line:
            text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
@ -144,12 +155,34 @@ class FB2MLizer(object):

    def get_text(self):
        text = ['<body>']
+        
+        # Create main section if there are no others to create
+        if self.opts.sectionize == 'nothing':
+            text.append('<section>')
+            self.section_level += 1
+        
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
-            text.append('<section>')
+            
+            # Start a <section> if we must sectionize each file or if the TOC references this page
+            page_section_open = False
+            if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
+                text.append('<section>')
+                page_section_open = True
+                self.section_level += 1
+            
            text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
+            
+            if page_section_open:
+                text.append('</section>')
+                self.section_level -= 1
+                
+        # Close any open sections
+        while self.section_level > 0:
            text.append('</section>')
+            self.section_level -= 1
+
        return ''.join(text) + '</body>'

    def fb2mlize_images(self):
@ -184,6 +217,17 @@ class FB2MLizer(object):
                        '%s.' % (item.href, e))
        return ''.join(images)

+    def create_flat_toc(self, nodes, level):
+        for item in nodes:
+            href, mid, id = item.href.partition('#')
+            if not id:
+                self.toc[href] = 'page'
+            else:
+                if not self.toc.get(href, None):
+                    self.toc[href] = {}
+                self.toc[href][id] = level
+                self.create_flat_toc(item.nodes, level + 1)
+
    def ensure_p(self):
        if self.in_p:
            return [], []
@ -254,10 +298,38 @@ class FB2MLizer(object):
        # First tag in tree
        tag = barename(elem_tree.tag)

+        # Convert TOC entries to <title>s and add <section>s
+        if self.opts.sectionize == 'toc':
+            # A section cannot be a child of any other element than another section,
+            # so leave the tag alone if there are parents
+            if not tag_stack:
+                # There are two reasons to start a new section here: the TOC pointed to
+                # this page (then we use the first non-<body> on the page as a <title>), or
+                # the TOC pointed to a specific element
+                newlevel = 0
+                toc_entry = self.toc.get(page.href, None)
+                if toc_entry == 'page':
+                    if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
+                        newlevel = 1
+                        self.toc[page.href] = None
+                elif toc_entry and elem_tree.attrib.get('id', None):
+                    newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
+                    
+                # Start a new section if necessary
+                if newlevel:
+                    if not (newlevel > self.section_level):
+                        fb2_out.append('</section>')
+                        self.section_level -= 1
+                    fb2_out.append('<section>')
+                    self.section_level += 1
+                    fb2_out.append('<title>')
+                    tags.append('title')
+            if self.section_level == 0:
+                # If none of the prior processing made a section, make one now to be FB2 spec compliant
+                fb2_out.append('<section>')
+                self.section_level += 1
+
        # Process the XHTML tag if it needs to be converted to an FB2 tag.
-        if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
-            fb2_out.append('<title>')
-            tags.append('title')
        if tag == 'img':
            if elem_tree.attrib.get('src', None):
                # Only write the image tag if it is in the manifest.
--- a/src/calibre/ebooks/fb2/output.py
+++ b/src/calibre/ebooks/fb2/output.py
@ -16,15 +16,15 @@ class FB2Output(OutputFormatPlugin):
    file_type = 'fb2'

    options = set([
-        OptionRecommendation(name='h1_to_title',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Wrap all h1 tags with fb2 title elements.')),
-        OptionRecommendation(name='h2_to_title',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Wrap all h2 tags with fb2 title elements.')),
-        OptionRecommendation(name='h3_to_title',
-            recommended_value=False, level=OptionRecommendation.LOW,
-            help=_('Wrap all h3 tags with fb2 title elements.')),
+        OptionRecommendation(name='sectionize',
+            recommended_value='files', level=OptionRecommendation.LOW,
+            choices=['toc', 'files', 'nothing'],
+            help=_('Specify the sectionization of elements. '
+                'A value of "nothing" turns the book into a single section. '
+                'A value of "files" turns each file into a separate section; use this if your device is having trouble. '
+                'A value of "Table of Contents" turns the entries in the Table of Contents into titles and creates sections; '
+                'if it fails, adjust the "Structure Detection" and/or "Table of Contents" settings '
+                '(turn on "Force use of auto-generated Table of Contents).')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -245,7 +245,7 @@ class RTFInput(InputFormatPlugin):
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
-        self.options = options
+        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        #Name of the preprocesssed RTF file
@ -290,12 +290,12 @@ class RTFInput(InputFormatPlugin):
            res = transform.tostring(result)
            res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
-            if not getattr(self.options, 'remove_paragraph_spacing', False):
+            if not getattr(self.opts, 'remove_paragraph_spacing', False):
                res = re.sub('\s*<body>', '<body>', res)
                res = re.sub('(?<=\n)\n{2}',
                        u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-            if self.options.preprocess_html:
-                preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+            if self.opts.preprocess_html:
+                preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
                res = preprocessor(res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
--- a/src/calibre/gui2/convert/fb2_output.py
+++ b/src/calibre/gui2/convert/fb2_output.py
@ -17,6 +17,8 @@ class PluginWidget(Widget, Ui_Form):
    ICON = I('mimetypes/fb2.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
-        Widget.__init__(self, parent, ['h1_to_title', 'h2_to_title', 'h3_to_title'])
+        Widget.__init__(self, parent, ['sectionize'])
        self.db, self.book_id = db, book_id
+        for x in ('toc', 'files', 'nothing'):
+            self.opt_sectionize.addItem(x)
        self.initialize_options(get_option, get_help, db, book_id)
--- a/src/calibre/gui2/convert/fb2_output.ui
+++ b/src/calibre/gui2/convert/fb2_output.ui
@ -14,7 +14,7 @@
   <string>Form</string>
  </property>
  <layout class="QGridLayout" name="gridLayout">
-   <item row="3" column="0">
+   <item row="1" column="0">
    <spacer name="verticalSpacer">
     <property name="orientation">
      <enum>Qt::Vertical</enum>
@ -28,23 +28,19 @@
    </spacer>
   </item>
   <item row="0" column="0">
-    <widget class="QCheckBox" name="opt_h1_to_title">
-     <property name="text">
-      <string>Wrap h1 tags with &lt;title&gt; elements</string>
-     </property>
-    </widget>
-   </item>
-   <item row="1" column="0">
-    <widget class="QCheckBox" name="opt_h2_to_title">
-     <property name="text">
-      <string>Wrap h2 tags with &lt;title&gt; elements</string>
-     </property>
-    </widget>
-   </item>
-   <item row="2" column="0">
-    <widget class="QCheckBox" name="opt_h3_to_title">
-     <property name="text">
-      <string>Wrap h3 tags with &lt;title&gt; elements</string>
+    <widget class="QLabel" name="label">
+    <property name="text">
+     <string>Sectionize:</string>
+    </property>
+    <property name="buddy">
+     <cstring>opt_sectionize</cstring>
+    </property>
+   </widget>
+  </item>
+  <item row="0" column="1">
+   <widget class="QComboBox" name="opt_sectionize">
+    <property name="minimumContentsLength">
+     <number>20</number>
     </property>
    </widget>
   </item>