Merge upstream changes

This commit is contained in:
Marshall T. Vandegrift 2009-01-19 22:54:27 -05:00
commit fd389eeca2
38 changed files with 13786 additions and 6917 deletions

File diff suppressed because one or more lines are too long

View File

@ -146,36 +146,7 @@ class PRS505(Device):
self._card_prefix = re.search(card_pat, mount).group(2) + os.sep
def open_windows_nowmi(self):
    '''
    Locate the reader's drives on Windows without going through WMI.

    Asks the ``winutil`` plugin for the volumes mounted for this device's
    ``VENDOR_ID``/``PRODUCT_ID`` and stores the results as drive roots
    (drive letter followed by a colon and a backslash):

    * ``self._main_prefix`` -- the volume whose device id contains
      ``'PRS-505/UC&'``. If several ids match, the last one seen wins.
    * ``self._card_prefix`` -- the first volume whose device id contains
      ``'PRS-505/UC:'`` and for which ``win32api.GetVolumeInformation``
      succeeds; ``None`` when there is no such volume.

    :raises DeviceError: if no main memory volume is found.
    '''
    from calibre import plugins
    winutil = plugins['winutil'][0]
    volumes = winutil.get_mounted_volumes_for_usb_device(self.VENDOR_ID, self.PRODUCT_ID)

    main = None
    for device_id, letter in volumes.items():
        if 'PRS-505/UC&' in device_id:
            main = letter + ':\\'
    if not main:
        raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
    self._main_prefix = main

    self._card_prefix = None
    win32api = __import__('win32api')
    for device_id, letter in volumes.items():
        if 'PRS-505/UC:' not in device_id:
            continue
        card = letter + ':\\'
        try:
            win32api.GetVolumeInformation(card)
        except Exception:
            # The volume cannot be queried (presumably a card slot with no
            # card inserted -- TODO confirm); try the next candidate.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still propagate.
            continue
        self._card_prefix = card
        break
def open_windows(self):
try:
self.open_windows_nowmi()
return
except:
pass
drives = []
wmi = __import__('wmi', globals(), locals(), [], -1)
c = wmi.WMI()

View File

@ -156,7 +156,7 @@ to auto-generate a Table of Contents.
help=_('Set the right margin in pts. Default is %default'))
layout('base_font_size2', ['--base-font-size'], default=12.0,
help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.'))
layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True,
layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False,
help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.'))
layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False,
help=_('Preserve the HTML tag structure while splitting large HTML files. This is only neccessary if the HTML files contain CSS that uses sibling selectors. Enabling this greatly slows down processing of large HTML files.'))

View File

@ -52,6 +52,7 @@ def convert(opts, recipe_arg, notification=None):
print 'Generating epub...'
opts.encoding = 'utf-8'
opts.remove_paragraph_spacing = True
html2epub(opf, opts, notification=notification)

View File

@ -128,6 +128,8 @@ class HTMLProcessor(Processor, Rationalizer):
if hasattr(self.body, 'xpath'):
for script in list(self.body.xpath('descendant::script')):
script.getparent().remove(script)
self.fix_markup()
def convert_image(self, img):
rpath = img.get('src', '')
@ -145,6 +147,17 @@ class HTMLProcessor(Processor, Rationalizer):
if val == rpath:
self.resource_map[key] = rpath+'_calibre_converted.jpg'
img.set('src', rpath+'_calibre_converted.jpg')
def fix_markup(self):
    '''
    Apply markup fix-ups so that the output renders correctly in the
    quirky ADE.

    Does nothing when ``self.body`` has no ``xpath`` attribute.
    '''
    body = self.body
    if not hasattr(body, 'xpath'):
        return
    # Every <br> that is a direct child of <body> becomes <p>&nbsp;</p>.
    for node in body.xpath('./br'):
        node.tag = 'p'
        node.text = u'\u00a0'
def save(self):
for meta in list(self.root.xpath('//meta')):

View File

@ -95,7 +95,7 @@ class EbookIterator(object):
for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
block = match.group(1)
family = re.compile(r'font-family\s*:\s*([^;]+)').search(block)
url = re.compile(r'url\s*\((.+?)\)', re.DOTALL).search(block)
url = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
if url:
path = url.group(1).split('/')
path = os.path.join(os.path.dirname(item.path), *path)

View File

@ -848,7 +848,7 @@ class Processor(Parser):
# Workaround for anchor rendering bug in ADE
css += '\n\na { color: inherit; text-decoration: inherit; cursor: default; }\na[href] { color: blue; text-decoration: underline; cursor:pointer; }'
if self.opts.remove_paragraph_spacing:
css += '\n\np {text-indent: 2em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
css += '\n\np {text-indent: 1.5em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
if self.opts.override_css:
css += '\n\n' + self.opts.override_css
self.override_css = self.css_parser.parseString(self.preprocess_css(css))

View File

@ -12,7 +12,7 @@ import copy
import re
from lxml import etree
from calibre.ebooks.oeb.base import namespace, barename
from calibre.ebooks.oeb.base import XHTML, XHTML_NS
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
@ -96,8 +96,11 @@ class MobiMLizer(object):
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
oeb.manifest.remove(item)
if item.spine_position is not None:
oeb.spine.remove(item)
if item.media_type in OEB_DOCS:
self.oeb.manifest.remove(item)
def mobimlize_spine(self):
for item in self.oeb.spine:
stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
@ -137,7 +140,7 @@ class MobiMLizer(object):
para = bstate.para
if tag in SPECIAL_TAGS and not text:
para = para if para is not None else bstate.body
elif para is None:
elif para is None or tag in ('td', 'th'):
body = bstate.body
if bstate.pbreak:
etree.SubElement(body, MBP('pagebreak'))
@ -157,7 +160,8 @@ class MobiMLizer(object):
elif indent != 0 and abs(indent) < self.profile.fbase:
indent = (indent / abs(indent)) * self.profile.fbase
if tag in NESTABLE_TAGS:
para = wrapper = etree.SubElement(parent, XHTML(tag))
para = wrapper = etree.SubElement(
parent, XHTML(tag), attrib=istate.attrib)
bstate.nested.append(para)
if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1
@ -337,6 +341,10 @@ class MobiMLizer(object):
tag = 'tr'
elif display == 'table-cell':
tag = 'td'
if tag in TABLE_TAGS:
for attr in ('rowspan', 'colspan'):
if attr in elem.attrib:
istate.attrib[attr] = elem.attrib[attr]
text = None
if elem.text:
if istate.preserve:
@ -374,6 +382,6 @@ class MobiMLizer(object):
bstate.vpadding += bstate.vmargin
bstate.vmargin = 0
bstate.vpadding += vpadding
if tag in NESTABLE_TAGS and bstate.nested:
if bstate.nested and bstate.nested[-1].tag == elem.tag:
bstate.nested.pop()
istates.pop()

View File

@ -124,6 +124,7 @@ class BookHeader(object):
sublangid = (langcode >> 10) & 0xFF
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
@ -441,17 +442,18 @@ class MobiReader(object):
os.makedirs(output_dir)
image_index = 0
self.image_names = []
for i in range(self.num_sections):
for i in range(self.book_header.first_image_index, self.num_sections):
if i in processed_records:
continue
processed_records.append(i)
data = self.sections[i][0]
buf = cStringIO.StringIO(data)
image_index += 1
try:
im = PILImage.open(buf)
except IOError:
except IOError, e:
continue
image_index += 1
path = os.path.join(output_dir, '%05d.jpg'%image_index)
self.image_names.append(os.path.basename(path))
im.convert('RGB').save(open(path, 'wb'), format='JPEG')
@ -476,6 +478,7 @@ def get_metadata(stream):
else:
tdir = tempfile.mkdtemp('_mobi_meta', __appname__+'_')
atexit.register(shutil.rmtree, tdir)
#print tdir
mr.extract_images([], tdir)
mi = mr.create_opf('dummy.html')
if mi.cover:
@ -491,7 +494,6 @@ def get_metadata(stream):
if os.access(candidate, os.R_OK):
cover = candidate
break
if os.access(cover, os.R_OK):
mi.cover_data = ('JPEG', open(os.path.join(tdir, cover), 'rb').read())
else:

View File

@ -95,6 +95,7 @@ class Serializer(object):
def __init__(self, oeb, images):
self.oeb = oeb
self.images = images
self.logger = oeb.logger
self.id_offsets = {}
self.href_offsets = defaultdict(list)
self.breaks = []
@ -144,8 +145,8 @@ class Serializer(object):
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
id = item.id if item else base.id
href = '#'.join((id, frag)) if frag else id
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buffer.write('filepos=')
self.href_offsets[href].append(buffer.tell())
buffer.write('0000000000')
@ -170,7 +171,7 @@ class Serializer(object):
buffer = self.buffer
if not item.linear:
self.breaks.append(buffer.tell() - 1)
self.id_offsets[item.id] = buffer.tell()
self.id_offsets[item.href] = buffer.tell()
for elem in item.data.find(XHTML('body')):
self.serialize_elem(elem, item)
buffer.write('<mbp:pagebreak/>')
@ -180,12 +181,11 @@ class Serializer(object):
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap:
return
hrefs = self.oeb.manifest.hrefs
tag = prefixname(elem.tag, nsrmap)
for attr in ('name', 'id'):
if attr in elem.attrib:
id = '#'.join((item.id, elem.attrib[attr]))
self.id_offsets[id] = buffer.tell()
href = '#'.join((item.href, elem.attrib[attr]))
self.id_offsets[href] = buffer.tell()
del elem.attrib[attr]
if tag == 'a' and not elem.attrib \
and not len(elem) and not elem.text:
@ -203,7 +203,7 @@ class Serializer(object):
continue
elif attr == 'src':
href = item.abshref(val)
if href in hrefs:
if href in self.images:
index = self.images[href]
buffer.write('recindex="%05d"' % index)
continue
@ -233,8 +233,12 @@ class Serializer(object):
def fixup_links(self):
buffer = self.buffer
for id, hoffs in self.href_offsets.items():
ioff = self.id_offsets[id]
id_offsets = self.id_offsets
for href, hoffs in self.href_offsets.items():
if href not in id_offsets:
self.logger.warn('Hyperlink target %r not found' % href)
href, _ = urldefrag(href)
ioff = self.id_offsets[href]
for hoff in hoffs:
buffer.seek(hoff)
buffer.write('%010d' % ioff)
@ -360,7 +364,11 @@ class MobiWriter(object):
if image.format not in ('JPEG', 'GIF'):
width, height = image.size
area = width * height
format = 'GIF' if area <= 40000 else 'JPEG'
if area <= 40000:
format = 'GIF'
else:
image = image.convert('RGBA')
format = 'JPEG'
changed = True
if dimen is not None:
image.thumbnail(dimen, Image.ANTIALIAS)

View File

@ -500,6 +500,7 @@ class Spine(object):
self.items.pop(index)
for i in xrange(index, len(self.items)):
self.items[i].spine_position = i
item.spine_position = None
def __iter__(self):
for item in self.items:
@ -796,12 +797,20 @@ class OEBBook(object):
def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self)
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
id = elem.get('id')
href = elem.get('href')
media_type = elem.get('media-type')
fallback = elem.get('fallback')
if href in manifest.hrefs:
self.logger.warn(u'Duplicate manifest entry for %r.' % href)
continue
if not self.container.exists(href):
self.logger.warn(u'Manifest item %r not found.' % href)
continue
manifest.add(elem.get('id'), href, elem.get('media-type'),
elem.get('fallback'))
if id in manifest.ids:
self.logger.warn(u'Duplicate manifest id %r.' % id)
id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback)
def _spine_from_opf(self, opf):
self.spine = spine = Spine(self)

View File

@ -41,8 +41,9 @@ class ManifestTrimmer(object):
while unchecked:
new = set()
for item in unchecked:
if item.media_type in OEB_DOCS or \
item.media_type[-4:] in ('/xml', '+xml'):
if (item.media_type in OEB_DOCS or
item.media_type[-4:] in ('/xml', '+xml')) and \
item.data is not None:
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
for href in chain(*hrefs):
href = item.abshref(href)

View File

@ -309,18 +309,7 @@ class Main(MainWindow, Ui_MainWindow):
self.library_path = dir
db = LibraryDatabase2(self.library_path)
self.library_view.set_database(db)
if self.olddb is not None:
pd = QProgressDialog('', '', 0, 100, self)
pd.setWindowModality(Qt.ApplicationModal)
pd.setCancelButton(None)
pd.setWindowTitle(_('Migrating database'))
pd.show()
number_of_books = db.migrate_old(self.olddb, pd)
self.olddb.close()
if number_of_books == 0:
os.remove(self.olddb.dbpath)
self.olddb = None
prefs['library_path'] = self.library_path
prefs['library_path'] = self.library_path
self.library_view.sortByColumn(*dynamic.get('sort_column', ('timestamp', Qt.DescendingOrder)))
if not self.library_view.restore_column_widths():
self.library_view.resizeColumnsToContents()
@ -1392,39 +1381,14 @@ class Main(MainWindow, Ui_MainWindow):
def initialize_database(self):
self.library_path = prefs['library_path']
self.olddb = None
if self.library_path is None: # Need to migrate to new database layout
QMessageBox.information(self, 'Database format changed',
'''\
<p>calibre's book storage format has changed. Instead of storing book files in a database, the
files are now stored in a folder on your filesystem. You will now be asked to choose the folder
in which you want to store your books files. Any existing books will be automatically migrated.
''')
self.database_path = prefs['database_path']
if not os.access(os.path.dirname(self.database_path), os.W_OK):
error_dialog(self, _('Database does not exist'),
_('The directory in which the database should be: %s no longer exists. Please choose a new database location.')%self.database_path).exec_()
self.database_path = choose_dir(self, 'database path dialog',
_('Choose new location for database'))
if not self.database_path:
self.database_path = os.path.expanduser('~').decode(sys.getfilesystemencoding())
if not os.path.exists(self.database_path):
os.makedirs(self.database_path)
self.database_path = os.path.join(self.database_path, 'library1.db')
prefs['database_path'] = self.database_path
home = os.path.dirname(self.database_path)
if not os.path.exists(home):
home = os.getcwd()
dir = unicode(QFileDialog.getExistingDirectory(self,
_('Choose a location for your ebook library.'), home))
_('Choose a location for your ebook library.'), os.getcwd()))
if not dir:
dir = os.path.dirname(self.database_path)
dir = os.path.expanduser('~/Library')
self.library_path = os.path.abspath(dir)
try:
self.olddb = LibraryDatabase(self.database_path)
except:
traceback.print_exc()
self.olddb = None
if not os.path.exists(self.library_path):
os.makedirs(self.library_path)
def read_settings(self):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,8 @@ recipe_modules = ['recipe_' + r for r in (
'time_magazine', 'endgadget', 'fudzilla', 'nspm_int', 'nspm', 'pescanik',
'spiegel_int', 'themarketticker', 'tomshardware', 'xkcd', 'ftd', 'zdnet',
'joelonsoftware', 'telepolis', 'common_dreams', 'nin', 'tomshardware_de',
'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche',
'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche', 'the_age',
'laprensa',
)]
import re, imp, inspect, time, os

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
laprensa.com.ar
'''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe
class LaPrensa(BasicNewsRecipe):
    # Recipe for laprensa.com.ar, driven by the RSS feeds listed in ``feeds``.

    title = 'La Prensa'
    __author__ = 'Darko Miletic'
    description = 'Informacion Libre las 24 horas'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'

    # Option/value pairs; the values reuse the class attributes above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Argentina',
        '--publisher', title,
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Politica', u'http://www.laprensa.com.ar/Rss.aspx?Rss=4'),
        (u'Economia', u'http://www.laprensa.com.ar/Rss.aspx?Rss=5'),
        (u'Opinion', u'http://www.laprensa.com.ar/Rss.aspx?Rss=6'),
        (u'El Mundo', u'http://www.laprensa.com.ar/Rss.aspx?Rss=7'),
        (u'Actualidad', u'http://www.laprensa.com.ar/Rss.aspx?Rss=8'),
        (u'Deportes', u'http://www.laprensa.com.ar/Rss.aspx?Rss=9'),
        (u'Espectaculos', u'http://www.laprensa.com.ar/Rss.aspx?Rss=10'),
    ]

    def print_version(self, url):
        # Map an article URL to its print view by rewriting the page suffix.
        printable = url.replace('.note.aspx', '.NotePrint.note.aspx')
        return printable

    def get_article_url(self, article):
        # UTF-8 encode the feed entry's link and percent-quote it, leaving
        # ':' and '/' untouched.
        link = article.get('link', None)
        return urllib.quote(link.encode('utf8'), ':/')

    def preprocess_html(self, soup):
        # Drop the onload attribute from <body> before further processing.
        del soup.body['onload']
        return soup

View File

@ -0,0 +1,55 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'
'''
theage.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class TheAge(BasicNewsRecipe):
    # Recipe for theage.com.au, built by scraping the text-only front page
    # rather than RSS feeds.

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    __author__ = 'Matthew Briggs'

    def get_browser(self):
        '''
        Return the base recipe's browser with refresh handling turned off.
        '''
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Build the list of sections and articles from the text front page.

        Every ``<h3>`` starts a new section named after its text. Every
        ``<a>`` with a non-empty ``href`` that follows a section heading
        becomes an article of that section; site-relative URLs (leading
        ``/``) are made absolute. Links seen before the first ``<h3>`` are
        ignored.

        Returns a list of ``(section title, [article dict, ...])`` tuples.
        Each article dict has the keys ``title``, ``url``, ``date``,
        ``description`` and ``content``. Sections without articles are
        omitted.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        feeds, articles = [], []
        feed = None

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                # A new heading closes the section collected so far.
                if articles:
                    feeds.append((feed, articles))
                    articles = []
                feed = self.tag_to_string(tag)
            elif feed is not None and tag.has_key('href') and tag['href'].strip():
                url = tag['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.theage.com.au' + url
                title = self.tag_to_string(tag)
                articles.append({
                    'title': title,
                    'url' : url,
                    'date' : strftime('%a, %d %b'),
                    'description' : '',
                    'content' : '',
                })

        # Sections are only stored when the *next* <h3> is seen, so the
        # final section has to be flushed here or its articles are lost.
        if articles:
            feeds.append((feed, articles))

        return feeds

View File

@ -398,7 +398,7 @@ class RecursiveFetcher(object, LoggingInterface):
_fname = basename(iurl)
if not isinstance(_fname, unicode):
_fname.decode('latin1', 'replace')
_fname.encode('ascii', 'replace').replace('%', '')
_fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
res = os.path.join(linkdiskpath, _fname)
self.downloaded_paths.append(res)
self.filemap[nurl] = res