Sync to trunk.

This commit is contained in:
John Schember 2009-07-08 06:26:31 -04:00
commit ccdc693b97
29 changed files with 367 additions and 118 deletions

View File

@ -308,14 +308,25 @@ def walk(dir):
yield os.path.join(record[0], f)
def strftime(fmt, t=None):
''' A version of strftime that returns unicode strings. '''
''' A version of strftime that returns unicode strings and tries to handle dates
before 1900 '''
if t is None:
t = time.localtime()
early_year = t[0] < 1900
if early_year:
fmt = fmt.replace('%Y', '_early year hack##')
t = list(t)
orig_year = t[0]
t[0] = 1900
ans = None
if iswindows:
if isinstance(fmt, unicode):
fmt = fmt.encode('mbcs')
return plugins['winutil'][0].strftime(fmt, t)
return time.strftime(fmt, t).decode(preferred_encoding, 'replace')
ans = plugins['winutil'][0].strftime(fmt, t)
ans = time.strftime(fmt, t).decode(preferred_encoding, 'replace')
if early_year:
ans = ans.replace('_early year hack##', str(orig_year))
return ans
def my_unichr(num):
try:

View File

@ -155,6 +155,9 @@ class OutputProfile(Plugin):
# The image size for comics
comic_screen_size = (584, 754)
# If True the MOBI renderer on the device supports MOBI indexing
supports_mobi_indexing = False
@classmethod
def tags_to_string(cls, tags):
return ', '.join(tags)
@ -254,6 +257,7 @@ class KindleOutput(OutputProfile):
dpi = 168.451
fbase = 16
fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
supports_mobi_indexing = True
@classmethod
def tags_to_string(cls, tags):
@ -269,6 +273,7 @@ class KindleDXOutput(OutputProfile):
screen_size = (744, 1022)
dpi = 150.0
comic_screen_size = (741, 1022)
supports_mobi_indexing = True
@classmethod
def tags_to_string(cls, tags):

View File

@ -563,6 +563,8 @@ OptionRecommendation(name='list_recipes',
break
self.read_user_metadata()
self.opts.no_inline_navbars = self.opts.output_profile.supports_mobi_indexing \
and self.output_fmt == 'mobi'
def flush(self):
try:

View File

@ -80,6 +80,8 @@ class EPUBOutput(OutputFormatPlugin):
])
recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])
TITLEPAGE_COVER = '''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
@ -134,6 +136,21 @@ class EPUBOutput(OutputFormatPlugin):
</body>
</html>
'''
def workaround_webkit_quirks(self):
from calibre.ebooks.oeb.base import XPath
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
if body:
body = body[0]
if not hasattr(body, 'xpath'):
continue
for pre in XPath('//h:pre')(body):
if not pre.text and len(pre) == 0:
pre.tag = 'div'
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
@ -146,6 +163,7 @@ class EPUBOutput(OutputFormatPlugin):
self.workaround_ade_quirks()
self.workaround_webkit_quirks()
from calibre.ebooks.oeb.transforms.rescale import RescaleImages
RescaleImages()(oeb, opts)

View File

@ -29,7 +29,7 @@ class LRFOptions(object):
self.use_metadata_cover = True
self.output = output
self.ignore_tables = opts.linearize_tables
self.base_font_size = 0
self.base_font_size = opts.base_font_size
self.blank_after_para = opts.insert_blank_line
self.use_spine = True
self.font_delta = 0

View File

@ -367,7 +367,7 @@ class MetaInformation(object):
if self.pubdate is not None:
ans += [(_('Published'), unicode(self.pubdate.isoformat(' ')))]
if self.rights is not None:
ans += [(_('Rights'), unicode(self.rights.isoformat(' ')))]
ans += [(_('Rights'), unicode(self.rights))]
for i, x in enumerate(ans):
ans[i] = u'<tr><td><b>%s</b></td><td>%s</td></tr>'%x
return u'<table>%s</table>'%u'\n'.join(ans)

View File

@ -31,7 +31,7 @@ def metadata_from_formats(formats):
try:
return _metadata_from_formats(formats)
except:
mi = metadata_from_filename(formats[0])
mi = metadata_from_filename(list(formats)[0])
if not mi.authors:
mi.authors = [_('Unknown')]
@ -126,14 +126,10 @@ def metadata_from_filename(name, pat=None):
mi.title = match.group('title')
except IndexError:
pass
try:
mi.authors = [match.group('author')]
except IndexError:
pass
try:
au = match.group('authors')
aus = string_to_authors(au)
mi.authors = authors
mi.authors = aus
except IndexError:
pass
try:

View File

@ -452,9 +452,12 @@ class OPF(object):
def __init__(self, stream, basedir=os.getcwdu(), unquote_urls=True):
if not hasattr(stream, 'read'):
stream = open(stream, 'rb')
raw = stream.read()
if not raw:
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
self.basedir = self.base_dir = basedir
self.path_to_html_toc = self.html_toc_fragment = None
raw, self.encoding = xml_to_unicode(stream.read(), strip_encoding_pats=True, resolve_entities=True)
raw, self.encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)
raw = raw[raw.find('<'):]
self.root = etree.fromstring(raw, self.PARSER)
self.metadata = self.metadata_path(self.root)

View File

@ -48,11 +48,7 @@ class MOBIOutput(OutputFormatPlugin):
self.opts.mobi_periodical = False
def check_for_masthead(self):
found = False
for typ in self.oeb.guide:
if type == 'masthead':
found = True
break
found = 'masthead' in self.oeb.guide
if not found:
self.oeb.log.debug('No masthead found, generating default one...')
from calibre.resources import server_resources
@ -76,12 +72,14 @@ class MOBIOutput(OutputFormatPlugin):
from calibre.ebooks.oeb.base import TOC
toc = self.oeb.toc
if toc and toc[0].klass != 'periodical':
start_href = self.oeb.spine[0].href
self.log('Converting TOC for MOBI periodical indexing...')
articles = {}
if toc.depth < 3:
sections = [TOC(klass='section')]
if toc.depth() < 3:
sections = [TOC(klass='section', title=_('All articles'),
href=start_href)]
for x in toc:
sections[0].append(x)
sections[0].nodes.append(x)
else:
sections = list(toc)
for x in sections:
@ -92,7 +90,7 @@ class MOBIOutput(OutputFormatPlugin):
a.klass = 'article'
articles[id(sec)].append(a)
sec.nodes.remove(a)
root = TOC(klass='periodical',
root = TOC(klass='periodical', href=start_href,
title=unicode(self.oeb.metadata.title[0]))
for s in sections:
if articles[id(s)]:

View File

@ -301,7 +301,7 @@ class MobiReader(object):
root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser
self.log.warning('Malformed markup, parsing using BeatifulSoup')
self.log.warning('Malformed markup, parsing using BeautifulSoup')
root = soupparser.fromstring(self.processed_html)
if root.tag != 'html':
@ -439,7 +439,12 @@ class MobiReader(object):
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = re.sub('\x14|\x15', '', self.processed_html)
self.processed_html = re.sub('\x14|\x15|\x1c|\x1d', '', self.processed_html)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None:
raw += unit
return raw
def upshift_markup(self, root):
self.log.debug('Converting style information to CSS...')
@ -469,13 +474,13 @@ class MobiReader(object):
if attrib.has_key('height'):
height = attrib.pop('height').strip()
if height:
styles.append('margin-top: %s' % height)
styles.append('margin-top: %s' % self.ensure_unit(height))
if attrib.has_key('width'):
width = attrib.pop('width').strip()
if width:
styles.append('text-indent: %s' % width)
styles.append('text-indent: %s' % self.ensure_unit(width))
if width.startswith('-'):
styles.append('margin-left: %s' % (width[1:]))
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
if attrib.has_key('align'):
align = attrib.pop('align').strip()
if align:

View File

@ -379,7 +379,7 @@ class MobiWriter(object):
try:
self._generate_index()
except:
self.oeb.log.exception('Failed to generate index')
self._oeb.log.exception('Failed to generate index')
self._generate_images()
@ -461,7 +461,7 @@ class MobiWriter(object):
h = child.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry "%s", aborting indexing ...'% child.title)
self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title)
return False
offset = self._id_offsets[h]
@ -573,7 +573,7 @@ class MobiWriter(object):
# Entries continues with a stream of section+articles, section+articles ...
h = child.href
if h not in self._id_offsets:
self._oeb.log.warning('Could not find TOC entry "%s", aborting indexing ...'% child.title)
self._oeb.log.warning(' Could not find TOC entry "%s", aborting indexing ...'% child.title)
return False
offset = self._id_offsets[h]
@ -1178,40 +1178,29 @@ class MobiWriter(object):
'''
toc = self._oeb.toc
nodes = list(toc.iter())[1:]
toc_conforms = True
for (i, child) in enumerate(nodes) :
if self.opts.verbose > 3 :
self._oeb.logger.info(" <title>: %-25.25s \tklass=%-15.15s \tdepth:%d playOrder=%03d" % \
if child.klass == "periodical" and child.depth() != 3 or \
child.klass == "section" and child.depth() != 2 or \
child.klass == "article" and child.depth() != 1 :
self._oeb.logger.warn('Nonconforming TOC entry: "%s" found at depth %d' % \
(child.klass, child.depth()) )
self._oeb.logger.warn(" <title>: '%-25.25s...' \t\tklass=%-15.15s \tdepth:%d \tplayOrder=%03d" % \
(child.title, child.klass, child.depth(), child.play_order) )
if child.klass == "periodical" and child.depth() != 3 :
self._oeb.logger.info('<navPoint class="periodical"> found at depth %d, nonconforming TOC' % \
child.depth() )
return False
if child.klass == "section" and child.depth() != 2 :
self._oeb.logger.info('<navPoint class="section"> found at depth %d, nonconforming TOC' % \
child.depth() )
return False
if child.klass == "article" and child.depth() != 1 :
self._oeb.logger.info('<navPoint class="article"> found at depth %d, nonconforming TOC' % \
child.depth() )
return False
toc_conforms = False
# We also need to know that we have a pubdate or timestamp in the metadata, which the Kindle needs
if self._oeb.metadata['date'] == [] and self._oeb.metadata['timestamp'] == [] :
self._oeb.logger.info('metadata missing timestamp needed for periodical')
return False
self._oeb.logger.info('metadata missing date/timestamp')
toc_conforms = False
# Periodicals also need a mastheadImage in the manifest
has_mastheadImage = 'masthead' in self._oeb.guide
if not 'masthead' in self._oeb.guide :
self._oeb.logger.info('mastheadImage missing from manifest')
toc_conforms = False
if not has_mastheadImage :
self._oeb.logger.info('mastheadImage missing from manifest, aborting periodical indexing')
return False
self._oeb.logger.info('TOC structure and pubdate verified')
return True
self._oeb.logger.info("%s" % " TOC structure conforms" if toc_conforms else " TOC structure non-conforming")
return toc_conforms
def _generate_text(self):
@ -1231,12 +1220,12 @@ class MobiWriter(object):
offset = 0
if self._compression != UNCOMPRESSED:
self._oeb.logger.info('Compressing markup content...')
self._oeb.logger.info(' Compressing markup content...')
data, overlap = self._read_text_record(text)
# Evaluate toc for conformance
if self.opts.mobi_periodical :
self._oeb.logger.info('--mobi-periodical specified, evaluating TOC for periodical conformance ...')
self._oeb.logger.info(' MOBI periodical specified, evaluating TOC for periodical conformance ...')
self._conforming_periodical_toc = self._evaluate_periodical_toc()
# This routine decides whether to build flat or structured based on self._conforming_periodical_toc
@ -1249,11 +1238,11 @@ class MobiWriter(object):
if len(entries) :
self._indexable = self._generate_indexed_navpoints()
else :
self._oeb.logger.info('No entries found in TOC ...')
self._oeb.logger.info(' No entries found in TOC ...')
self._indexable = False
if not self._indexable :
self._oeb.logger.info('Writing unindexed mobi ...')
self._oeb.logger.info(' Writing unindexed mobi ...')
while len(data) > 0:
if self._compression == PALMDOC:
@ -1271,6 +1260,7 @@ class MobiWriter(object):
while breaks and (breaks[0] - offset) < RECORD_SIZE:
# .pop returns item, removes it from list
pbreak = (breaks.pop(0) - running) >> 3
if self.opts.verbose > 2 :
self._oeb.logger.info('pbreak = 0x%X at 0x%X' % (pbreak, record.tell()) )
encoded = decint(pbreak, DECINT_FORWARD)
record.write(encoded)
@ -1384,7 +1374,7 @@ class MobiWriter(object):
# 0x002 MOBI book (chapter - chapter navigation)
# 0x101 News - Hierarchical navigation with sections and articles
# 0x102 News feed - Flat navigation
# 0x103 News magazine - same as 1x101
# 0x103 News magazine - same as 0x101
# 0xC - 0xF : Text encoding (65001 is utf-8)
# 0x10 - 0x13 : UID
# 0x14 - 0x17 : Generator version
@ -1545,7 +1535,7 @@ class MobiWriter(object):
exth.write(data)
nrecs += 1
if term == 'rights' :
rights = unicode(oeb.metadata.rights[0])
rights = unicode(oeb.metadata.rights[0]).encode('utf-8')
exth.write(pack('>II', EXTH_CODES['rights'], len(rights) + 8))
exth.write(rights)
@ -1614,7 +1604,7 @@ class MobiWriter(object):
self._write(record)
def _generate_index(self):
self._oeb.log('Generating primary index ...')
self._oeb.log('Generating INDX ...')
self._primary_index_record = None
# Build the NCXEntries and INDX
@ -1953,6 +1943,8 @@ class MobiWriter(object):
first = False
else :
self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1
currentOffset = 0
for (i, child) in enumerate(toc.iter()):
# Only add chapters or articles at depth==1
# no class defaults to 'chapter'
@ -1961,8 +1953,20 @@ class MobiWriter(object):
if self.opts.verbose > 2 :
self._oeb.logger.info("adding (klass:%s depth:%d) %s to flat ctoc" % \
(child.klass, child.depth(), child) )
# Test to see if this child's offset is the same as the previous child's
# offset, skip it
h = child.href
currentOffset = self._id_offsets[h]
# print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, ctoc)
reduced_toc.append(child)
previousOffset = currentOffset
else :
self._oeb.logger.warn(" Ignoring redundant href: %s in '%s'" % (h, child.title))
first = False
else :
if self.opts.verbose > 2 :

View File

@ -1468,7 +1468,9 @@ class TOC(object):
node.to_opf1(tour)
return tour
def to_ncx(self, parent):
def to_ncx(self, parent=None):
if parent is None:
parent = etree.Element(NCX('navMap'))
for node in self.nodes:
id = node.id or unicode(uuid.uuid4())
attrib = {'id': id, 'playOrder': str(node.play_order)}

View File

@ -9,6 +9,8 @@ from lxml import etree
from calibre.customize.conversion import OutputFormatPlugin
from calibre import CurrentDir
from calibre.customize.conversion import OptionRecommendation
from urllib import unquote
class OEBOutput(OutputFormatPlugin):
@ -17,6 +19,9 @@ class OEBOutput(OutputFormatPlugin):
author = 'Kovid Goyal'
file_type = 'oeb'
recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
self.log, self.opts = log, opts
if not os.path.exists(output_path):

View File

@ -169,7 +169,8 @@ class Stylizer(object):
if not matches and class_sel_pat.match(text):
found = False
for x in tree.xpath('//*[@class]'):
if text.lower().endswith('.'+x.get('class').lower()):
if text.lower().endswith('.'+x.get('class').lower()) and \
text.lower() != text:
matches.append(x)
found = True
if found:

View File

@ -34,7 +34,8 @@ class Clean(object):
for x in list(self.oeb.guide):
href = urldefrag(self.oeb.guide[x].href)[0]
if x.lower() not in ('cover', 'titlepage'):
if x.lower() not in ('cover', 'titlepage', 'masthead', 'toc',
'title-page', 'copyright-page'):
self.oeb.guide.remove(x)

View File

@ -16,7 +16,7 @@ from lxml import etree
from lxml.cssselect import CSSSelector
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
urldefrag, rewrite_links, urlunquote, barename
urldefrag, rewrite_links, urlunquote, barename, XHTML
from calibre.ebooks.epub import rules
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -216,7 +216,25 @@ class FlowSplitter(object):
self.trees.append(before)
tree = after
self.trees.append(tree)
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
trees, ids = [], set([])
for tree in self.trees:
root = tree.getroot()
if self.is_page_empty(root):
discarded_ids = root.xpath('//*[@id]')
for x in discarded_ids:
x = x.get('id')
if not x.startswith('calibre_'):
ids.add(x)
else:
if ids:
body = self.get_body(root)
if body is not None:
for x in ids:
body.insert(0, body.makeelement(XHTML('div'),
id=x, style='height:0pt'))
ids = set([])
trees.append(tree)
self.trees = trees
def get_body(self, root):
body = root.xpath('//h:body', namespaces=NAMESPACES)

View File

@ -107,8 +107,6 @@ class Adder(QObject):
self.callback(self.paths, self.names, self.infos)
self.callback_called = True
def update(self):
if not self.ids:
self.timer.stop()

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 306 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 553 B

View File

@ -657,6 +657,8 @@ class LibraryDatabase2(LibraryDatabase):
author = sanitize_file_name(authors.split(',')[0][:self.PATH_LIMIT]).decode(filesystem_encoding, 'replace')
title = sanitize_file_name(self.title(id, index_is_id=True)[:self.PATH_LIMIT]).decode(filesystem_encoding, 'replace')
name = title + ' - ' + author
while name.endswith('.'):
name = name[:-1]
return name
def rmtree(self, path):
@ -1074,6 +1076,8 @@ class LibraryDatabase2(LibraryDatabase):
self.set_isbn(id, mi.isbn, notify=False)
if mi.series_index:
self.set_series_index(id, mi.series_index, notify=False)
if mi.pubdate:
self.set_pubdate(id, mi.pubdate, notify=False)
if getattr(mi, 'timestamp', None) is not None:
self.set_timestamp(id, mi.timestamp, notify=False)
self.set_path(id, True)
@ -1734,7 +1738,7 @@ books_series_link feeds
formats = self.find_books_in_directory(dirpath, True)
if not formats:
return
formats = list(formats)
mi = metadata_from_formats(formats)
if mi.title is None:
return

View File

@ -366,10 +366,9 @@ class LibraryServer(object):
@expose
def index(self, **kwargs):
'The / URL'
stanza = cherrypy.request.headers.get('Stanza-Device-Name', 919)
if stanza == 919:
return self.static('index.html')
return self.stanza()
want_opds = cherrypy.request.headers.get('Stanza-Device-Name', 919) != \
919 or cherrypy.request.headers.get('Want-OPDS-Catalog', 919) != 919
return self.stanza() if want_opds else self.static('index.html')
@expose

View File

@ -469,6 +469,7 @@ class BasicNewsRecipe(Recipe):
self.username = options.username
self.password = options.password
self.lrf = options.lrf
self.include_navbars = not options.no_inline_navbars
self.output_dir = os.path.abspath(self.output_dir)
if options.test:
@ -539,7 +540,7 @@ class BasicNewsRecipe(Recipe):
if first_fetch and job_info:
url, f, a, feed_len = job_info
body = soup.find('body')
if body is not None:
if body is not None and self.include_navbars:
templ = self.navbar.generate(False, f, a, feed_len,
not self.has_single_feed,
url, __appname__,
@ -907,6 +908,7 @@ class BasicNewsRecipe(Recipe):
body = soup.find('body')
if body is not None:
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
if self.include_navbars:
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
@ -923,7 +925,7 @@ class BasicNewsRecipe(Recipe):
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
desc = f.description
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,

View File

@ -51,9 +51,11 @@ recipe_modules = ['recipe_' + r for r in (
'theeconomictimes_india', '7dias', 'buenosaireseconomico',
'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
'fastcompany', 'accountancyage',
'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
'eltiempo_hn',
)]
import re, imp, inspect, time, os
from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tiempo.hn
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class ElTiempoHn(BasicNewsRecipe):
title = 'El Tiempo - Honduras'
__author__ = 'Darko Miletic'
description = 'Noticias de Honduras y mundo'
publisher = 'El Tiempo'
category = 'news, politics, Honduras'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
language = _('Spanish')
lang = 'es-HN'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} img {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em}"'
remove_tags = [dict(name=['form','object','embed','base'])]
keep_only_tags = [dict(name='td' , attrs={'id':'mainbodycont'})]
feeds = [(u'Noticias', u'http://www.tiempo.hn/index.php?format=feed&type=rss')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.laprensahn.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class LaPrensaHn(BasicNewsRecipe):
title = 'La Prensa - Honduras'
__author__ = 'Darko Miletic'
description = 'Noticias de Honduras y mundo'
publisher = 'La Prensa'
category = 'news, politics, Honduras'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
language = _('Spanish')
lang = 'es-HN'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
remove_tags = [dict(name=['form','object','embed'])]
keep_only_tags = [
dict(name='h1' , attrs={'class':'titulo1'})
,dict(name='div', attrs={'class':['sumario11','hora','texto']})
]
feeds = [(u'Noticias', u'http://feeds.feedburner.com/laprensa_titulares')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -0,0 +1,65 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.latribuna.hn
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class LaTribuna(BasicNewsRecipe):
title = 'La Tribuna - Honduras'
__author__ = 'Darko Miletic'
description = 'Noticias de Honduras y mundo'
publisher = 'La Tribuna'
category = 'news, politics, Honduras'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
language = _('Spanish')
lang = 'es-HN'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
remove_tags = [dict(name=['form','object','embed'])]
keep_only_tags = [
dict(name='p', attrs={'id':['BlogTitle','BlogDate']})
,dict(name='div', attrs={'id':'BlogContent'})
]
feeds = [(u'Noticias', u'http://www.latribuna.hn/web2.0/?feed=rss')]
def print_version(self, url):
return url + '&print=1'
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup('http://www.latribuna.hn/web2.0/')
cover_item = soup.find('div',attrs={'class':'portada_impresa'})
if cover_item:
cover_url = cover_item.a.img['src']
return cover_url

View File

@ -53,6 +53,10 @@ class WallStreetJournal(BasicNewsRecipe):
def postprocess_html(self, soup, first):
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
for tag in soup.findAll('div', dict(id=["articleImage_1", "articleImage_2", "articleImage_3", "articleImage_4", "articleImage_5", "articleImage_6", "articleImage_7"])):
tag.extract()
return soup
def get_article_url(self, article):
@ -70,7 +74,7 @@ class WallStreetJournal(BasicNewsRecipe):
#('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
(' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
(' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
# ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
#('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
('Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),