Merge from trunk

This commit is contained in:
Charles Haley 2010-12-19 09:56:21 +00:00
commit 8b56f2d8c7
11 changed files with 172 additions and 98 deletions

View File

@ -11,7 +11,7 @@
- title: "Page turn animations in the e-book viewer" - title: "Page turn animations in the e-book viewer"
type: major type: major
description: > description: >
"Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. Setting it to o disables the animation completely." "Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. Setting it to 0 disables the animation completely."
- title: "Conversion pipeline: Add an option to set the minimum line height of all elements as a percentage of the computed font size. By default, calibre now sets the line height to 120% of the computed font size." - title: "Conversion pipeline: Add an option to set the minimum line height of all elements as a percentage of the computed font size. By default, calibre now sets the line height to 120% of the computed font size."

View File

@ -1,78 +1,72 @@
# -*- coding: utf-8 -*- import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class JournalofHospitalMedicine(BasicNewsRecipe): class JournalofHospitalMedicine(BasicNewsRecipe):
title = 'Journal of Hospital Medicine' title = 'Journal of Hospital Medicine'
__author__ = 'Krittika Goyal' __author__ = 'Kovid Goyal'
description = 'Medical news' description = 'Medical news'
timefmt = ' [%d %b, %Y]' timefmt = ' [%d %b, %Y]'
needs_subscription = True needs_subscription = True
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
keep_only_tags = [dict(id=['articleTitle', 'articleMeta', 'fulltext'])]
remove_tags = [dict(attrs={'class':'licensedContent'})]
# TO LOGIN # TO LOGIN
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.open('http://www3.interscience.wiley.com/cgi-bin/home') br.open('http://www3.interscience.wiley.com/cgi-bin/home')
br.select_form(name='siteLogin') br.select_form(nr=0)
br['LoginName'] = self.username br['j_username'] = self.username
br['Password'] = self.password br['j_password'] = self.password
response = br.submit() response = br.submit()
raw = response.read() raw = response.read()
if 'userName = ""' in raw: if '<h2>LOGGED IN</h2>' not in raw:
raise Exception('Login failed. Check your username and password') raise Exception('Login failed. Check your username and password')
return br return br
#TO GET ARTICLE TOC #TO GET ARTICLE TOC
def johm_get_index(self): def johm_get_index(self):
return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home') return self.index_to_soup('http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1553-5606/currentissue')
# To parse article toc # To parse article toc
def parse_index(self): def parse_index(self):
parse_soup = self.johm_get_index() soup = self.johm_get_index()
toc = soup.find(id='issueTocGroups')
div = parse_soup.find(id='contentCell')
current_section = None
current_articles = []
feeds = [] feeds = []
for x in div.findAll(True): for group in toc.findAll('li', id=re.compile(r'group\d+')):
if x.name == 'h4': gtitle = group.find(attrs={'class':'subSectionHeading'})
# Section heading found if gtitle is None:
if current_articles and current_section: continue
feeds.append((current_section, current_articles)) gtitle = self.tag_to_string(gtitle)
current_section = self.tag_to_string(x) arts = group.find(attrs={'class':'articles'})
current_articles = [] if arts is None:
self.log('\tFound section:', current_section) continue
if current_section is not None and x.name == 'strong': self.log('Found section:', gtitle)
title = self.tag_to_string(x) articles = []
p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x) for art in arts.findAll(attrs={'class':lambda x: x and 'tocArticle'
if p is None: in x}):
continue a = art.find('a', href=True)
url = p.get('href', False) if a is None:
if not url or not title:
continue continue
url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'http://www3.interscience.wiley.com'+url url = 'http://onlinelibrary.wiley.com' + url
url = url.replace('/HTMLSTART', '/main.html,ftx_abs') url = url.replace('/abstract', '/full')
self.log('\t\tFound article:', title) title = self.tag_to_string(a)
self.log('\t\t\t', url) a.extract()
#if url.startswith('/'): pm = art.find(attrs={'class':'productMenu'})
#url = 'http://online.wsj.com'+url if pm is not None:
current_articles.append({'title': title, 'url':url, pm.extract()
'description':'', 'date':''}) desc = self.tag_to_string(art)
self.log('\tFound article:', title, 'at', url)
if current_articles and current_section: articles.append({'title':title, 'url':url, 'description':desc,
feeds.append((current_section, current_articles)) 'date':''})
if articles:
feeds.append((gtitle, articles))
return feeds return feeds
def preprocess_html(self, soup):
for img in soup.findAll('img', src=True):
img['src'] = img['src'].replace('tfig', 'nfig')
return soup

View File

@ -4,7 +4,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class NYTimes(BasicNewsRecipe): class NYTimes(BasicNewsRecipe):
title = 'New England Journal of Medicine' title = 'New England Journal of Medicine'
__author__ = 'Krittika Goyal' __author__ = 'Kovid Goyal'
description = 'Medical news' description = 'Medical news'
timefmt = ' [%d %b, %Y]' timefmt = ' [%d %b, %Y]'
needs_subscription = True needs_subscription = True

View File

@ -36,6 +36,16 @@ Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTM
Install pywin32 and edit win32com\__init__.py setting _frozen = True and Install pywin32 and edit win32com\__init__.py setting _frozen = True and
__gen_path__ to a temp dir (otherwise it tries to set it to a dir in the install tree which leads to permission errors) __gen_path__ to a temp dir (otherwise it tries to set it to a dir in the install tree which leads to permission errors)
Note that you should use::
import tempfile
__gen_path__ = os.path.join(
tempfile.gettempdir(), "gen_py",
"%d.%d" % (sys.version_info[0], sys.version_info[1]))
Use gettempdir instead of the win32 api method as gettempdir returns a temp dir that is guaranteed to actually work.
Also edit win32com\client\gencache.py and change the except IOError on line 57 to catch all exceptions. Also edit win32com\client\gencache.py and change the except IOError on line 57 to catch all exceptions.
SQLite SQLite

View File

@ -28,7 +28,7 @@ class ANDROID(USBMS):
# Motorola # Motorola
0x22b8 : { 0x41d9 : [0x216], 0x2d67 : [0x100], 0x41db : [0x216], 0x22b8 : { 0x41d9 : [0x216], 0x2d67 : [0x100], 0x41db : [0x216],
0x4285 : [0x216]}, 0x4285 : [0x216], 0x42a3 : [0x216] },
# Sony Ericsson # Sony Ericsson
0xfce : { 0xd12e : [0x0100]}, 0xfce : { 0xd12e : [0x0100]},

View File

@ -62,9 +62,9 @@ class SWEEX(USBMS):
# Ordered list of supported formats # Ordered list of supported formats
FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt'] FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt']
VENDOR_ID = [0x0525] VENDOR_ID = [0x0525, 0x177f]
PRODUCT_ID = [0xa4a5] PRODUCT_ID = [0xa4a5, 0x300]
BCD = [0x0319] BCD = [0x0319, 0x110]
VENDOR_NAME = 'SWEEX' VENDOR_NAME = 'SWEEX'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER' WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER'

View File

@ -27,13 +27,10 @@ class FB2MLizer(object):
''' '''
Todo: * Include more FB2 specific tags in the conversion. Todo: * Include more FB2 specific tags in the conversion.
* Handle a tags. * Handle a tags.
* Figure out some way to turn oeb_book.toc items into <section><title>
<p> to allow for readers to generate toc from the document.
''' '''
def __init__(self, log): def __init__(self, log):
self.log = log self.log = log
self.image_hrefs = {}
self.reset_state() self.reset_state()
def reset_state(self): def reset_state(self):
@ -43,17 +40,25 @@ class FB2MLizer(object):
# in different directories. FB2 images are all in a flat layout so we rename all images # in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names. # into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {} self.image_hrefs = {}
# Mapping of toc item hrefs to 'page' or to a dict of {element id: section level}
self.toc = {}
# Used to see whether a new <section> needs to be opened
self.section_level = 0
def extract_content(self, oeb_book, opts): def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to FB2 markup...') self.log.info('Converting XHTML to FB2 markup...')
self.oeb_book = oeb_book self.oeb_book = oeb_book
self.opts = opts self.opts = opts
self.reset_state()
# Used for adding <section>s and <title>s to allow readers
# to generate toc from the document.
if self.opts.sectionize == 'toc':
self.create_flat_toc(self.oeb_book.toc, 1)
return self.fb2mlize_spine() return self.fb2mlize_spine()
def fb2mlize_spine(self): def fb2mlize_spine(self):
self.reset_state()
output = [self.fb2_header()] output = [self.fb2_header()]
output.append(self.get_text()) output.append(self.get_text())
output.append(self.fb2mlize_images()) output.append(self.fb2mlize_images())
@ -66,13 +71,19 @@ class FB2MLizer(object):
return u'<?xml version="1.0" encoding="UTF-8"?>' + output return u'<?xml version="1.0" encoding="UTF-8"?>' + output
def clean_text(self, text): def clean_text(self, text):
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
text = re.sub(r'(?miu)\s+</section>', '</section>', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
text = re.sub(r'(?miu)<p>\s*</p>', '', text) text = re.sub(r'(?miu)<p>\s*</p>', '', text)
text = re.sub(r'(?miu)\s+</p>', '</p>', text) text = re.sub(r'(?miu)\s*</p>', '</p>', text)
text = re.sub(r'(?miu)</p><p>', '</p>\n\n<p>', text) text = re.sub(r'(?miu)</p>\s*<p>', '</p>\n\n<p>', text)
text = re.sub(r'(?miu)<title>\s*</title>', '', text)
text = re.sub(r'(?miu)\s+</title>', '</title>', text)
text = re.sub(r'(?miu)<section>\s*</section>', '', text)
text = re.sub(r'(?miu)\s*</section>', '\n</section>', text)
text = re.sub(r'(?miu)</section>\s*', '</section>\n\n', text)
text = re.sub(r'(?miu)\s*<section>', '\n<section>', text)
text = re.sub(r'(?miu)<section>\s*', '<section>\n', text)
text = re.sub(r'(?miu)</section><section>', '</section>\n\n<section>', text)
if self.opts.insert_blank_line: if self.opts.insert_blank_line:
text = re.sub(r'(?miu)</p>', '</p><empty-line />', text) text = re.sub(r'(?miu)</p>', '</p><empty-line />', text)
@ -144,12 +155,34 @@ class FB2MLizer(object):
def get_text(self): def get_text(self):
text = ['<body>'] text = ['<body>']
# Create main section if there are no others to create
if self.opts.sectionize == 'nothing':
text.append('<section>')
self.section_level += 1
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append('<section>')
# Start a <section> if we must sectionize each file or if the TOC references this page
page_section_open = False
if self.opts.sectionize == 'files' or self.toc.get(item.href) == 'page':
text.append('<section>')
page_section_open = True
self.section_level += 1
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
if page_section_open:
text.append('</section>')
self.section_level -= 1
# Close any open sections
while self.section_level > 0:
text.append('</section>') text.append('</section>')
self.section_level -= 1
return ''.join(text) + '</body>' return ''.join(text) + '</body>'
def fb2mlize_images(self): def fb2mlize_images(self):
@ -184,6 +217,17 @@ class FB2MLizer(object):
'%s.' % (item.href, e)) '%s.' % (item.href, e))
return ''.join(images) return ''.join(images)
def create_flat_toc(self, nodes, level):
for item in nodes:
href, mid, id = item.href.partition('#')
if not id:
self.toc[href] = 'page'
else:
if not self.toc.get(href, None):
self.toc[href] = {}
self.toc[href][id] = level
self.create_flat_toc(item.nodes, level + 1)
def ensure_p(self): def ensure_p(self):
if self.in_p: if self.in_p:
return [], [] return [], []
@ -254,10 +298,38 @@ class FB2MLizer(object):
# First tag in tree # First tag in tree
tag = barename(elem_tree.tag) tag = barename(elem_tree.tag)
# Convert TOC entries to <title>s and add <section>s
if self.opts.sectionize == 'toc':
# A section cannot be a child of any other element than another section,
# so leave the tag alone if there are parents
if not tag_stack:
# There are two reasons to start a new section here: the TOC pointed to
# this page (then we use the first non-<body> on the page as a <title>), or
# the TOC pointed to a specific element
newlevel = 0
toc_entry = self.toc.get(page.href, None)
if toc_entry == 'page':
if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
newlevel = 1
self.toc[page.href] = None
elif toc_entry and elem_tree.attrib.get('id', None):
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)
# Start a new section if necessary
if newlevel:
if not (newlevel > self.section_level):
fb2_out.append('</section>')
self.section_level -= 1
fb2_out.append('<section>')
self.section_level += 1
fb2_out.append('<title>')
tags.append('title')
if self.section_level == 0:
# If none of the prior processing made a section, make one now to be FB2 spec compliant
fb2_out.append('<section>')
self.section_level += 1
# Process the XHTML tag if it needs to be converted to an FB2 tag. # Process the XHTML tag if it needs to be converted to an FB2 tag.
if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
fb2_out.append('<title>')
tags.append('title')
if tag == 'img': if tag == 'img':
if elem_tree.attrib.get('src', None): if elem_tree.attrib.get('src', None):
# Only write the image tag if it is in the manifest. # Only write the image tag if it is in the manifest.

View File

@ -16,15 +16,15 @@ class FB2Output(OutputFormatPlugin):
file_type = 'fb2' file_type = 'fb2'
options = set([ options = set([
OptionRecommendation(name='h1_to_title', OptionRecommendation(name='sectionize',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value='files', level=OptionRecommendation.LOW,
help=_('Wrap all h1 tags with fb2 title elements.')), choices=['toc', 'files', 'nothing'],
OptionRecommendation(name='h2_to_title', help=_('Specify the sectionization of elements. '
recommended_value=False, level=OptionRecommendation.LOW, 'A value of "nothing" turns the book into a single section. '
help=_('Wrap all h2 tags with fb2 title elements.')), 'A value of "files" turns each file into a separate section; use this if your device is having trouble. '
OptionRecommendation(name='h3_to_title', 'A value of "Table of Contents" turns the entries in the Table of Contents into titles and creates sections; '
recommended_value=False, level=OptionRecommendation.LOW, 'if it fails, adjust the "Structure Detection" and/or "Table of Contents" settings '
help=_('Wrap all h3 tags with fb2 title elements.')), '(turn on "Force use of auto-generated Table of Contents).')),
]) ])
def convert(self, oeb_book, output_path, input_plugin, opts, log): def convert(self, oeb_book, output_path, input_plugin, opts, log):

View File

@ -245,7 +245,7 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
self.options = options self.opts = options
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
#Name of the preprocessed RTF file #Name of the preprocessed RTF file
@ -290,12 +290,12 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
if not getattr(self.options, 'remove_paragraph_spacing', False): if not getattr(self.opts, 'remove_paragraph_spacing', False):
res = re.sub('\s*<body>', '<body>', res) res = re.sub('\s*<body>', '<body>', res)
res = re.sub('(?<=\n)\n{2}', res = re.sub('(?<=\n)\n{2}',
u'<p>\u00a0</p>\n'.encode('utf-8'), res) u'<p>\u00a0</p>\n'.encode('utf-8'), res)
if self.options.preprocess_html: if self.opts.preprocess_html:
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
res = preprocessor(res) res = preprocessor(res)
f.write(res) f.write(res)
self.write_inline_css(inline_class, border_styles) self.write_inline_css(inline_class, border_styles)

View File

@ -17,6 +17,8 @@ class PluginWidget(Widget, Ui_Form):
ICON = I('mimetypes/fb2.png') ICON = I('mimetypes/fb2.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['h1_to_title', 'h2_to_title', 'h3_to_title']) Widget.__init__(self, parent, ['sectionize'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in ('toc', 'files', 'nothing'):
self.opt_sectionize.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="3" column="0"> <item row="1" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -28,23 +28,19 @@
</spacer> </spacer>
</item> </item>
<item row="0" column="0"> <item row="0" column="0">
<widget class="QCheckBox" name="opt_h1_to_title"> <widget class="QLabel" name="label">
<property name="text"> <property name="text">
<string>Wrap h1 tags with &lt;title&gt; elements</string> <string>Sectionize:</string>
</property> </property>
</widget> <property name="buddy">
</item> <cstring>opt_sectionize</cstring>
<item row="1" column="0"> </property>
<widget class="QCheckBox" name="opt_h2_to_title"> </widget>
<property name="text"> </item>
<string>Wrap h2 tags with &lt;title&gt; elements</string> <item row="0" column="1">
</property> <widget class="QComboBox" name="opt_sectionize">
</widget> <property name="minimumContentsLength">
</item> <number>20</number>
<item row="2" column="0">
<widget class="QCheckBox" name="opt_h3_to_title">
<property name="text">
<string>Wrap h3 tags with &lt;title&gt; elements</string>
</property> </property>
</widget> </widget>
</item> </item>