Merge upstream changes

Marshall T. Vandegrift 2009-01-19 22:54:27 -05:00
commit fd389eeca2
38 changed files with 13786 additions and 6917 deletions



@@ -146,36 +146,7 @@ class PRS505(Device):
             self._card_prefix = re.search(card_pat, mount).group(2) + os.sep

-    def open_windows_nowmi(self):
-        from calibre import plugins
-        winutil = plugins['winutil'][0]
-        volumes = winutil.get_mounted_volumes_for_usb_device(self.VENDOR_ID, self.PRODUCT_ID)
-        main = None
-        for device_id in volumes.keys():
-            if 'PRS-505/UC&' in device_id:
-                main = volumes[device_id]+':\\'
-        if not main:
-            raise DeviceError(_('Unable to detect the %s disk drive. Try rebooting.')%self.__class__.__name__)
-        self._main_prefix = main
-        card = self._card_prefix = None
-        win32api = __import__('win32api')
-        for device_id in volumes.keys():
-            if 'PRS-505/UC:' in device_id:
-                card = volumes[device_id]+':\\'
-                try:
-                    win32api.GetVolumeInformation(card)
-                    self._card_prefix = card
-                    break
-                except:
-                    continue
-
     def open_windows(self):
-        try:
-            self.open_windows_nowmi()
-            return
-        except:
-            pass
         drives = []
         wmi = __import__('wmi', globals(), locals(), [], -1)
         c = wmi.WMI()


@@ -156,7 +156,7 @@ to auto-generate a Table of Contents.
           help=_('Set the right margin in pts. Default is %default'))
     layout('base_font_size2', ['--base-font-size'], default=12.0,
           help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.'))
-    layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True,
+    layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=False,
           help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.'))
     layout('preserve_tag_structure', ['--preserve-tag-structure'], default=False,
           help=_('Preserve the HTML tag structure while splitting large HTML files. This is only neccessary if the HTML files contain CSS that uses sibling selectors. Enabling this greatly slows down processing of large HTML files.'))


@@ -52,6 +52,7 @@ def convert(opts, recipe_arg, notification=None):
     print 'Generating epub...'
     opts.encoding = 'utf-8'
+    opts.remove_paragraph_spacing = True
     html2epub(opf, opts, notification=notification)


@@ -128,6 +128,8 @@ class HTMLProcessor(Processor, Rationalizer):
         if hasattr(self.body, 'xpath'):
             for script in list(self.body.xpath('descendant::script')):
                 script.getparent().remove(script)

+        self.fix_markup()
+
     def convert_image(self, img):
         rpath = img.get('src', '')
@@ -145,6 +147,17 @@ class HTMLProcessor(Processor, Rationalizer):
             if val == rpath:
                 self.resource_map[key] = rpath+'_calibre_converted.jpg'
         img.set('src', rpath+'_calibre_converted.jpg')

+    def fix_markup(self):
+        '''
+        Perform various markup transforms to get the output to render correctly
+        in the quirky ADE.
+        '''
+        # Replace <br> that are children of <body> with <p>&nbsp;</p>
+        if hasattr(self.body, 'xpath'):
+            for br in self.body.xpath('./br'):
+                br.tag = 'p'
+                br.text = u'\u00a0'
+
     def save(self):
         for meta in list(self.root.xpath('//meta')):
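
The fix_markup() pass added above rewrites <br> elements that sit directly under <body> as paragraphs containing a non-breaking space, presumably because Adobe Digital Editions does not reliably render such bare line breaks as vertical space. A minimal standalone sketch of the same transform, using plain (non-namespaced) lxml rather than calibre's document tree:

    from lxml import etree

    html = '<html><body><p>one</p><br/><br/><p>two</p></body></html>'
    root = etree.fromstring(html)
    body = root.find('body')

    # Replace <br> children of <body> with <p>&#160;</p> so a blank line shows up.
    for br in body.xpath('./br'):
        br.tag = 'p'
        br.text = u'\u00a0'

    print etree.tostring(root)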


@@ -95,7 +95,7 @@ class EbookIterator(object):
             for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
                 block = match.group(1)
                 family = re.compile(r'font-family\s*:\s*([^;]+)').search(block)
-                url = re.compile(r'url\s*\((.+?)\)', re.DOTALL).search(block)
+                url = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
                 if url:
                     path = url.group(1).split('/')
                     path = os.path.join(os.path.dirname(item.path), *path)
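
The loosened regular expression above tolerates optional single or double quotes around the @font-face URL, so url(font.ttf), url('font.ttf') and url("font.ttf") all yield the bare path. A quick illustrative check (the file names are made up):

    import re

    url_pat = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL)

    for block in ('url(fonts/serif.ttf)',
                  "url('fonts/serif.ttf')",
                  'url("fonts/serif.ttf")'):
        # Each form should yield the path without any surrounding quotes.
        assert url_pat.search(block).group(1) == 'fonts/serif.ttf'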


@@ -848,7 +848,7 @@ class Processor(Parser):
             # Workaround for anchor rendering bug in ADE
             css += '\n\na { color: inherit; text-decoration: inherit; cursor: default; }\na[href] { color: blue; text-decoration: underline; cursor:pointer; }'
         if self.opts.remove_paragraph_spacing:
-            css += '\n\np {text-indent: 2em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
+            css += '\n\np {text-indent: 1.5em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
         if self.opts.override_css:
             css += '\n\n' + self.opts.override_css
         self.override_css = self.css_parser.parseString(self.preprocess_css(css))


@@ -12,7 +12,7 @@ import copy
 import re
 from lxml import etree
 from calibre.ebooks.oeb.base import namespace, barename
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, OEB_DOCS
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks.oeb.transforms.flatcss import KeyMapper
@@ -96,8 +96,11 @@ class MobiMLizer(object):
             href = oeb.guide['cover'].href
             del oeb.guide['cover']
             item = oeb.manifest.hrefs[href]
-            oeb.manifest.remove(item)
+            if item.spine_position is not None:
+                oeb.spine.remove(item)
+            if item.media_type in OEB_DOCS:
+                self.oeb.manifest.remove(item)

     def mobimlize_spine(self):
         for item in self.oeb.spine:
             stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
@@ -137,7 +140,7 @@ class MobiMLizer(object):
         para = bstate.para
         if tag in SPECIAL_TAGS and not text:
             para = para if para is not None else bstate.body
-        elif para is None:
+        elif para is None or tag in ('td', 'th'):
             body = bstate.body
             if bstate.pbreak:
                 etree.SubElement(body, MBP('pagebreak'))
@@ -157,7 +160,8 @@ class MobiMLizer(object):
         elif indent != 0 and abs(indent) < self.profile.fbase:
             indent = (indent / abs(indent)) * self.profile.fbase
         if tag in NESTABLE_TAGS:
-            para = wrapper = etree.SubElement(parent, XHTML(tag))
+            para = wrapper = etree.SubElement(
+                parent, XHTML(tag), attrib=istate.attrib)
             bstate.nested.append(para)
             if tag == 'li' and len(istates) > 1:
                 istates[-2].list_num += 1
@@ -337,6 +341,10 @@ class MobiMLizer(object):
             tag = 'tr'
         elif display == 'table-cell':
             tag = 'td'
+        if tag in TABLE_TAGS:
+            for attr in ('rowspan', 'colspan'):
+                if attr in elem.attrib:
+                    istate.attrib[attr] = elem.attrib[attr]
         text = None
         if elem.text:
             if istate.preserve:
@@ -374,6 +382,6 @@ class MobiMLizer(object):
             bstate.vpadding += bstate.vmargin
             bstate.vmargin = 0
         bstate.vpadding += vpadding
-        if tag in NESTABLE_TAGS and bstate.nested:
+        if bstate.nested and bstate.nested[-1].tag == elem.tag:
             bstate.nested.pop()
         istates.pop()


@@ -124,6 +124,7 @@ class BookHeader(object):
             sublangid = (langcode >> 10) & 0xFF
             self.language = main_language.get(langid, 'ENGLISH')
             self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]

             self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
             self.exth = None
@@ -441,17 +442,18 @@ class MobiReader(object):
             os.makedirs(output_dir)
         image_index = 0
         self.image_names = []
-        for i in range(self.num_sections):
+        for i in range(self.book_header.first_image_index, self.num_sections):
             if i in processed_records:
                 continue
             processed_records.append(i)
             data = self.sections[i][0]
             buf = cStringIO.StringIO(data)
-            image_index += 1
             try:
                 im = PILImage.open(buf)
-            except IOError:
+            except IOError, e:
                 continue
+            image_index += 1
             path = os.path.join(output_dir, '%05d.jpg'%image_index)
             self.image_names.append(os.path.basename(path))
             im.convert('RGB').save(open(path, 'wb'), format='JPEG')
@@ -476,6 +478,7 @@ def get_metadata(stream):
     else:
         tdir = tempfile.mkdtemp('_mobi_meta', __appname__+'_')
         atexit.register(shutil.rmtree, tdir)
+        #print tdir
         mr.extract_images([], tdir)
         mi = mr.create_opf('dummy.html')
         if mi.cover:
@@ -491,7 +494,6 @@ def get_metadata(stream):
                 if os.access(candidate, os.R_OK):
                     cover = candidate
                     break
-
             if os.access(cover, os.R_OK):
                 mi.cover_data = ('JPEG', open(os.path.join(tdir, cover), 'rb').read())
             else:
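
The new first_image_index field is read as a big-endian unsigned 32-bit integer at offset 0x6c of the MOBI header record, and extract_images() now starts scanning at that record instead of walking every section from zero. A minimal sketch of that header read (the raw bytes here are fabricated, not a real header):

    import struct

    # Fake 0x80-byte MOBI header record with the first-image index set to 123.
    raw = b'\x00' * 0x6c + struct.pack('>L', 123) + b'\x00' * 0x10

    # Offset 0x6c holds the index of the first image record (big-endian uint32).
    first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
    assert first_image_index == 123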


@@ -95,6 +95,7 @@ class Serializer(object):
     def __init__(self, oeb, images):
         self.oeb = oeb
         self.images = images
+        self.logger = oeb.logger
         self.id_offsets = {}
         self.href_offsets = defaultdict(list)
         self.breaks = []
@@ -144,8 +145,8 @@ class Serializer(object):
         item = hrefs[path] if path else None
         if item and item.spine_position is None:
             return False
-        id = item.id if item else base.id
-        href = '#'.join((id, frag)) if frag else id
+        path = item.href if item else base.href
+        href = '#'.join((path, frag)) if frag else path
         buffer.write('filepos=')
         self.href_offsets[href].append(buffer.tell())
         buffer.write('0000000000')
@@ -170,7 +171,7 @@ class Serializer(object):
         buffer = self.buffer
         if not item.linear:
             self.breaks.append(buffer.tell() - 1)
-        self.id_offsets[item.id] = buffer.tell()
+        self.id_offsets[item.href] = buffer.tell()
         for elem in item.data.find(XHTML('body')):
             self.serialize_elem(elem, item)
         buffer.write('<mbp:pagebreak/>')
@@ -180,12 +181,11 @@ class Serializer(object):
         if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) not in nsrmap:
             return
-        hrefs = self.oeb.manifest.hrefs
         tag = prefixname(elem.tag, nsrmap)
         for attr in ('name', 'id'):
             if attr in elem.attrib:
-                id = '#'.join((item.id, elem.attrib[attr]))
-                self.id_offsets[id] = buffer.tell()
+                href = '#'.join((item.href, elem.attrib[attr]))
+                self.id_offsets[href] = buffer.tell()
                 del elem.attrib[attr]
         if tag == 'a' and not elem.attrib \
            and not len(elem) and not elem.text:
@@ -203,7 +203,7 @@ class Serializer(object):
                     continue
                 elif attr == 'src':
                     href = item.abshref(val)
-                    if href in hrefs:
+                    if href in self.images:
                         index = self.images[href]
                         buffer.write('recindex="%05d"' % index)
                         continue
@@ -233,8 +233,12 @@ class Serializer(object):
     def fixup_links(self):
         buffer = self.buffer
-        for id, hoffs in self.href_offsets.items():
-            ioff = self.id_offsets[id]
+        id_offsets = self.id_offsets
+        for href, hoffs in self.href_offsets.items():
+            if href not in id_offsets:
+                self.logger.warn('Hyperlink target %r not found' % href)
+                href, _ = urldefrag(href)
+            ioff = self.id_offsets[href]
             for hoff in hoffs:
                 buffer.seek(hoff)
                 buffer.write('%010d' % ioff)
@@ -360,7 +364,11 @@ class MobiWriter(object):
         if image.format not in ('JPEG', 'GIF'):
             width, height = image.size
             area = width * height
-            format = 'GIF' if area <= 40000 else 'JPEG'
+            if area <= 40000:
+                format = 'GIF'
+            else:
+                image = image.convert('RGBA')
+                format = 'JPEG'
             changed = True
         if dimen is not None:
             image.thumbnail(dimen, Image.ANTIALIAS)
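
The Serializer changes above key id_offsets and href_offsets by href instead of manifest id, but the underlying filepos mechanism is unchanged: every link is first written with a ten-digit placeholder and its position recorded, and fixup_links() later seeks back and overwrites each placeholder with the byte offset of its target. A stripped-down sketch of that two-pass scheme in the same Python 2 style as the diff (the hrefs and markup are illustrative, not calibre's actual output):

    from cStringIO import StringIO

    buf = StringIO()
    id_offsets = {}       # target href -> byte offset where the target starts
    href_offsets = {}     # link href   -> offsets of its filepos placeholders

    # First pass: emit a fixed-width placeholder for the link target.
    buf.write('<a filepos=')
    href_offsets.setdefault('chapter2.html', []).append(buf.tell())
    buf.write('0000000000')
    buf.write('>next</a>')

    # Record where the target itself begins.
    id_offsets['chapter2.html'] = buf.tell()
    buf.write('<h1>Chapter 2</h1>')

    # Second pass: overwrite each placeholder with the target's offset.
    for href, offsets in href_offsets.items():
        target = id_offsets[href]
        for off in offsets:
            buf.seek(off)
            buf.write('%010d' % target)

    print buf.getvalue()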


@@ -500,6 +500,7 @@ class Spine(object):
         self.items.pop(index)
         for i in xrange(index, len(self.items)):
             self.items[i].spine_position = i
+        item.spine_position = None

     def __iter__(self):
         for item in self.items:
@@ -796,12 +797,20 @@ class OEBBook(object):
     def _manifest_from_opf(self, opf):
         self.manifest = manifest = Manifest(self)
         for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
+            id = elem.get('id')
             href = elem.get('href')
+            media_type = elem.get('media-type')
+            fallback = elem.get('fallback')
+            if href in manifest.hrefs:
+                self.logger.warn(u'Duplicate manifest entry for %r.' % href)
+                continue
             if not self.container.exists(href):
                 self.logger.warn(u'Manifest item %r not found.' % href)
                 continue
-            manifest.add(elem.get('id'), href, elem.get('media-type'),
-                         elem.get('fallback'))
+            if id in manifest.ids:
+                self.logger.warn(u'Duplicate manifest id %r.' % id)
+                id, href = manifest.generate(id, href)
+            manifest.add(id, href, media_type, fallback)

     def _spine_from_opf(self, opf):
         self.spine = spine = Spine(self)


@@ -41,8 +41,9 @@ class ManifestTrimmer(object):
         while unchecked:
             new = set()
             for item in unchecked:
-                if item.media_type in OEB_DOCS or \
-                   item.media_type[-4:] in ('/xml', '+xml'):
+                if (item.media_type in OEB_DOCS or
+                    item.media_type[-4:] in ('/xml', '+xml')) and \
+                   item.data is not None:
                     hrefs = [sel(item.data) for sel in LINK_SELECTORS]
                     for href in chain(*hrefs):
                         href = item.abshref(href)


@@ -309,18 +309,7 @@ class Main(MainWindow, Ui_MainWindow):
             self.library_path = dir
         db = LibraryDatabase2(self.library_path)
         self.library_view.set_database(db)
-        if self.olddb is not None:
-            pd = QProgressDialog('', '', 0, 100, self)
-            pd.setWindowModality(Qt.ApplicationModal)
-            pd.setCancelButton(None)
-            pd.setWindowTitle(_('Migrating database'))
-            pd.show()
-            number_of_books = db.migrate_old(self.olddb, pd)
-            self.olddb.close()
-            if number_of_books == 0:
-                os.remove(self.olddb.dbpath)
-            self.olddb = None
-            prefs['library_path'] = self.library_path
+        prefs['library_path'] = self.library_path
         self.library_view.sortByColumn(*dynamic.get('sort_column', ('timestamp', Qt.DescendingOrder)))
         if not self.library_view.restore_column_widths():
             self.library_view.resizeColumnsToContents()
@@ -1392,39 +1381,14 @@ class Main(MainWindow, Ui_MainWindow):
     def initialize_database(self):
         self.library_path = prefs['library_path']
-        self.olddb = None
         if self.library_path is None: # Need to migrate to new database layout
-            QMessageBox.information(self, 'Database format changed',
-                    '''\
-<p>calibre's book storage format has changed. Instead of storing book files in a database, the
-files are now stored in a folder on your filesystem. You will now be asked to choose the folder
-in which you want to store your books files. Any existing books will be automatically migrated.
-''')
-            self.database_path = prefs['database_path']
-            if not os.access(os.path.dirname(self.database_path), os.W_OK):
-                error_dialog(self, _('Database does not exist'),
-                        _('The directory in which the database should be: %s no longer exists. Please choose a new database location.')%self.database_path).exec_()
-                self.database_path = choose_dir(self, 'database path dialog',
-                        _('Choose new location for database'))
-                if not self.database_path:
-                    self.database_path = os.path.expanduser('~').decode(sys.getfilesystemencoding())
-                if not os.path.exists(self.database_path):
-                    os.makedirs(self.database_path)
-                self.database_path = os.path.join(self.database_path, 'library1.db')
-                prefs['database_path'] = self.database_path
-            home = os.path.dirname(self.database_path)
-            if not os.path.exists(home):
-                home = os.getcwd()
             dir = unicode(QFileDialog.getExistingDirectory(self,
-                        _('Choose a location for your ebook library.'), home))
+                        _('Choose a location for your ebook library.'), os.getcwd()))
             if not dir:
-                dir = os.path.dirname(self.database_path)
+                dir = os.path.expanduser('~/Library')
             self.library_path = os.path.abspath(dir)
-            try:
-                self.olddb = LibraryDatabase(self.database_path)
-            except:
-                traceback.print_exc()
-                self.olddb = None
+            if not os.path.exists(self.library_path):
+                os.makedirs(self.library_path)

     def read_settings(self):



@@ -22,7 +22,8 @@ recipe_modules = ['recipe_' + r for r in (
     'time_magazine', 'endgadget', 'fudzilla', 'nspm_int', 'nspm', 'pescanik',
     'spiegel_int', 'themarketticker', 'tomshardware', 'xkcd', 'ftd', 'zdnet',
     'joelonsoftware', 'telepolis', 'common_dreams', 'nin', 'tomshardware_de',
-    'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche',
+    'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche', 'the_age',
+    'laprensa',
 )]

 import re, imp, inspect, time, os


@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+'''
+laprensa.com.ar
+'''
+
+import urllib
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LaPrensa(BasicNewsRecipe):
+    title = 'La Prensa'
+    __author__ = 'Darko Miletic'
+    description = 'Informacion Libre las 24 horas'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
+    cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
+
+    html2lrf_options = [
+                          '--comment' , description
+                        , '--category' , 'news, Argentina'
+                        , '--publisher' , title
+                        ]
+
+    feeds = [
+              (u'Politica'    , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' )
+             ,(u'Economia'    , u'http://www.laprensa.com.ar/Rss.aspx?Rss=5' )
+             ,(u'Opinion'     , u'http://www.laprensa.com.ar/Rss.aspx?Rss=6' )
+             ,(u'El Mundo'    , u'http://www.laprensa.com.ar/Rss.aspx?Rss=7' )
+             ,(u'Actualidad'  , u'http://www.laprensa.com.ar/Rss.aspx?Rss=8' )
+             ,(u'Deportes'    , u'http://www.laprensa.com.ar/Rss.aspx?Rss=9' )
+             ,(u'Espectaculos', u'http://www.laprensa.com.ar/Rss.aspx?Rss=10')
+            ]
+
+    def print_version(self, url):
+        return url.replace('.note.aspx','.NotePrint.note.aspx')
+
+    def get_article_url(self, article):
+        raw = article.get('link', None).encode('utf8')
+        final = urllib.quote(raw,':/')
+        return final
+
+    def preprocess_html(self, soup):
+        del soup.body['onload']
+        return soup


@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+theage.com.au
+'''
+
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheAge(BasicNewsRecipe):
+
+    title = 'The Age'
+    description = 'Business News, World News and Breaking News in Melbourne, Australia'
+    __author__ = 'Matthew Briggs'
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        br.set_handle_refresh(False)
+        return br
+
+    def parse_index(self):
+        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())
+
+        feeds, articles = [], []
+        feed = None
+
+        for tag in soup.findAll(['h3', 'a']):
+            if tag.name == 'h3':
+                if articles:
+                    feeds.append((feed, articles))
+                    articles = []
+                feed = self.tag_to_string(tag)
+            elif feed is not None and tag.has_key('href') and tag['href'].strip():
+                url = tag['href'].strip()
+                if url.startswith('/'):
+                    url = 'http://www.theage.com.au' + url
+                title = self.tag_to_string(tag)
+                articles.append({
+                    'title': title,
+                    'url' : url,
+                    'date' : strftime('%a, %d %b'),
+                    'description' : '',
+                    'content' : '',
+                    })
+
+        return feeds


@@ -398,7 +398,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode):
                         _fname.decode('latin1', 'replace')
-                    _fname.encode('ascii', 'replace').replace('%', '')
+                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                     res = os.path.join(linkdiskpath, _fname)
                     self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
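
The old line above discarded its own result: encode() returns a new string rather than modifying _fname in place, so the '%' stripping never took effect. The fix assigns the sanitised name back and also removes path separators. For example:

    import os

    _fname = u'caf\xe9 50%' + os.sep + 'report.html'

    # Old behaviour: the sanitised copy was computed and thrown away.
    _fname.encode('ascii', 'replace').replace('%', '')

    # New behaviour: keep the sanitised name and strip path separators too.
    safe = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
    assert safe == 'caf? 50report.html'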