mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Sync to trunk.
This commit is contained in:
commit
50b71bd449
@ -361,6 +361,8 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
|
||||
return '&'+ent+';'
|
||||
if ent == 'apos':
|
||||
return "'"
|
||||
if ent == 'hellips':
|
||||
ent = 'hellip'
|
||||
if ent.startswith(u'#x'):
|
||||
num = int(ent[2:], 16)
|
||||
if encoding is None or num > 255:
|
||||
@ -382,6 +384,15 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
|
||||
except KeyError:
|
||||
return '&'+ent+';'
|
||||
|
||||
_ent_pat = re.compile(r'&(\S+);')
|
||||
|
||||
def prepare_string_for_xml(raw, attribute=False):
    '''
    Escape `raw` so that it can be embedded in XML/HTML markup.

    Entity references matched by ``_ent_pat`` are first passed through
    ``entity_to_unicode``; afterwards ``&``, ``<`` and ``>`` are replaced by
    the corresponding predefined XML entities.

    :param raw: The text to escape.
    :param attribute: If True, double and single quotes are escaped as well,
        so the result can be placed inside an attribute value.
    :return: The escaped text.
    '''
    raw = _ent_pat.sub(entity_to_unicode, raw)
    # '&' has to be handled first, otherwise the ampersands introduced by the
    # following replacements would themselves get escaped.
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        raw = raw.replace('"', '&quot;').replace("'", '&apos;')
    return raw
|
||||
|
||||
if isosx:
|
||||
fdir = os.path.expanduser('~/.fonts')
|
||||
try:
|
||||
|
@ -2,7 +2,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__appname__ = 'calibre'
|
||||
__version__ = '0.6.0b16'
|
||||
__version__ = '0.6.0b17'
|
||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||
|
||||
import re
|
||||
|
@ -57,6 +57,35 @@ class HTMLRenderer(object):
|
||||
self.loop.exit(0)
|
||||
|
||||
|
||||
def extract_cover_from_embedded_svg(html, base, log):
    '''
    Return the bytes of the image referenced by an SVG-wrapped cover page.

    The page qualifies only if it contains exactly one ``svg`` element whose
    single child is an SVG ``image`` element carrying an ``xlink:href``.

    :param html: Markup of the cover page; parsed with ``etree.fromstring``.
    :param base: Directory against which the image href is resolved.
    :param log: Log object. Not used by this function at present.
    :return: Contents of the referenced image file, or None if the page does
        not have the expected structure or the file is not readable.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    root = etree.fromstring(html)

    svg = XPath('//svg:svg')(root)
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        # Test href before splitting it: an <image> without an xlink:href
        # would otherwise fail with AttributeError on None.
        if href:
            path = os.path.join(base, *href.split('/'))
            if os.access(path, os.R_OK):
                # Close the file deterministically instead of relying on GC
                with open(path, 'rb') as f:
                    return f.read()
    return None
|
||||
|
||||
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    '''
    Return cover image data for the HTML file at `path_to_html`.

    If the file mentions the SVG namespace, an attempt is made to read the
    image straight out of an embedded SVG wrapper via
    ``extract_cover_from_embedded_svg``. If that is not applicable, fails or
    yields nothing, the page is rendered with ``render_html`` instead.

    :param path_to_html: Path to the HTML file.
    :param log: Callable log object, used to report a failed SVG extraction.
    :param width: Width passed on to ``render_html``.
    :param height: Height passed on to ``render_html``.
    :return: The image data, or None if neither approach produced any.
    '''
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    data = None
    if SVG_NS in raw:
        try:
            data = extract_cover_from_embedded_svg(raw,
                    os.path.dirname(path_to_html), log)
        except Exception:
            # Extraction is best effort only; fall through to rendering.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            log('Failed to extract cover from embedded SVG, rendering page instead')
    if data is None:
        renderer = render_html(path_to_html, width, height)
        data = getattr(renderer, 'data', None)
    return data
|
||||
|
||||
|
||||
def render_html(path_to_html, width=590, height=750):
|
||||
from PyQt4.QtWebKit import QWebPage
|
||||
from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize
|
||||
|
@ -54,7 +54,7 @@ class EPUBInput(InputFormatPlugin):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def rationalize_cover(self, opf):
|
||||
def rationalize_cover(self, opf, log):
|
||||
guide_cover, guide_elem = None, None
|
||||
for guide_elem in opf.iterguide():
|
||||
if guide_elem.get('type', '').lower() == 'cover':
|
||||
@ -65,28 +65,37 @@ class EPUBInput(InputFormatPlugin):
|
||||
spine = list(opf.iterspine())
|
||||
if not spine:
|
||||
return
|
||||
# Check if the cover specified in the guide is also
|
||||
# the first element in spine
|
||||
idref = spine[0].get('idref', '')
|
||||
manifest = list(opf.itermanifest())
|
||||
if not manifest:
|
||||
return
|
||||
if manifest[0].get('id', False) != idref:
|
||||
elem = [x for x in manifest if x.get('id', '') == idref]
|
||||
if not elem or elem[0].get('href', None) != guide_cover:
|
||||
return
|
||||
log('Found HTML cover', guide_cover)
|
||||
|
||||
# Remove from spine as covers must be treated
|
||||
# specially
|
||||
spine[0].getparent().remove(spine[0])
|
||||
guide_elem.set('href', 'calibre_raster_cover.jpg')
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
t = etree.SubElement(elem[0].getparent(), OPF('item'),
|
||||
href=guide_elem.get('href'), id='calibre_raster_cover')
|
||||
t.set('media-type', 'image/jpeg')
|
||||
for elem in list(opf.iterguide()):
|
||||
if elem.get('type', '').lower() == 'titlepage':
|
||||
elem.getparent().remove(elem)
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
t = etree.SubElement(guide_elem.getparent(), OPF('reference'))
|
||||
t.set('type', 'titlepage')
|
||||
t.set('href', guide_cover)
|
||||
t.set('title', 'Title Page')
|
||||
from calibre.ebooks import render_html
|
||||
renderer = render_html(guide_cover)
|
||||
from calibre.ebooks import render_html_svg_workaround
|
||||
renderer = render_html_svg_workaround(guide_cover, log)
|
||||
if renderer is not None:
|
||||
open('calibre_raster_cover.jpg', 'wb').write(
|
||||
renderer.data)
|
||||
|
||||
renderer)
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
@ -121,7 +130,7 @@ class EPUBInput(InputFormatPlugin):
|
||||
for elem in opf.iterguide():
|
||||
elem.set('href', delta+elem.get('href'))
|
||||
|
||||
self.rationalize_cover(opf)
|
||||
self.rationalize_cover(opf, log)
|
||||
|
||||
with open('content.opf', 'wb') as nopf:
|
||||
nopf.write(opf.render())
|
||||
|
@ -12,7 +12,7 @@ from urllib import unquote
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre import strftime, guess_type
|
||||
from calibre import strftime, guess_type, prepare_string_for_xml
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
|
||||
from lxml import etree
|
||||
@ -210,6 +210,7 @@ class EPUBOutput(OutputFormatPlugin):
|
||||
id, href = self.oeb.manifest.generate('calibre-logo',
|
||||
'calibre-logo.png')
|
||||
self.oeb.manifest.add(id, href, 'image/png', data=img_data)
|
||||
title, author = map(prepare_string_for_xml, (title, author))
|
||||
html = self.TITLEPAGE%dict(title=title, author=author,
|
||||
date=strftime('%d %b, %Y'),
|
||||
app=__appname__ +' '+__version__,
|
||||
|
@ -5,14 +5,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''Read meta information from epub files'''
|
||||
|
||||
import os, time
|
||||
import os
|
||||
from cStringIO import StringIO
|
||||
from contextlib import closing
|
||||
|
||||
from PyQt4.Qt import QUrl, QEventLoop, QSize, QByteArray, QBuffer, \
|
||||
SIGNAL, QPainter, QImage, QObject, QApplication, Qt, QPalette
|
||||
from PyQt4.QtWebKit import QWebPage
|
||||
|
||||
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
@ -102,64 +98,9 @@ class OCFDirReader(OCFReader):
|
||||
def open(self, path, *args, **kwargs):
|
||||
return open(os.path.join(self.root, path), *args, **kwargs)
|
||||
|
||||
class CoverRenderer(QObject):
|
||||
WIDTH = 600
|
||||
HEIGHT = 800
|
||||
|
||||
def __init__(self, path):
|
||||
if QApplication.instance() is None:
|
||||
QApplication([])
|
||||
QObject.__init__(self)
|
||||
self.loop = QEventLoop()
|
||||
self.page = QWebPage()
|
||||
pal = self.page.palette()
|
||||
pal.setBrush(QPalette.Background, Qt.white)
|
||||
self.page.setPalette(pal)
|
||||
self.page.setViewportSize(QSize(self.WIDTH, self.HEIGHT))
|
||||
self.page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
|
||||
self.page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
|
||||
QObject.connect(self.page, SIGNAL('loadFinished(bool)'), self.render_html)
|
||||
self._image_data = None
|
||||
self.rendered = False
|
||||
url = QUrl.fromLocalFile(os.path.normpath(path))
|
||||
self.page.mainFrame().load(url)
|
||||
|
||||
def render_html(self, ok):
|
||||
try:
|
||||
if not ok:
|
||||
self.rendered = True
|
||||
return
|
||||
image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
|
||||
image.setDotsPerMeterX(96*(100/2.54))
|
||||
image.setDotsPerMeterY(96*(100/2.54))
|
||||
painter = QPainter(image)
|
||||
self.page.mainFrame().render(painter)
|
||||
painter.end()
|
||||
ba = QByteArray()
|
||||
buf = QBuffer(ba)
|
||||
buf.open(QBuffer.WriteOnly)
|
||||
image.save(buf, 'JPEG')
|
||||
self._image_data = str(ba.data())
|
||||
finally:
|
||||
self.loop.exit(0)
|
||||
self.rendered = True
|
||||
|
||||
def image_data():
|
||||
def fget(self):
|
||||
if not self.rendered:
|
||||
self.loop.exec_()
|
||||
count = 0
|
||||
while count < 50 and not self.rendered:
|
||||
time.sleep(0.1)
|
||||
count += 1
|
||||
return self._image_data
|
||||
return property(fget=fget)
|
||||
image_data = image_data()
|
||||
|
||||
|
||||
def get_cover(opf, opf_path, stream):
|
||||
from calibre.gui2 import is_ok_to_use_qt
|
||||
if not is_ok_to_use_qt(): return None
|
||||
from calibre.ebooks import render_html_svg_workaround
|
||||
from calibre.utils.logging import default_log
|
||||
spine = list(opf.spine_items())
|
||||
if not spine:
|
||||
return
|
||||
@ -172,8 +113,7 @@ def get_cover(opf, opf_path, stream):
|
||||
cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
|
||||
if not os.path.exists(cpage):
|
||||
return
|
||||
cr = CoverRenderer(cpage)
|
||||
return cr.image_data
|
||||
return render_html_svg_workaround(cpage, default_log)
|
||||
|
||||
def get_metadata(stream, extract_cover=True):
|
||||
""" Return metadata as a :class:`MetaInformation` object """
|
||||
|
@ -443,7 +443,7 @@ class MobiReader(object):
|
||||
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
|
||||
self.processed_html = self.processed_html.replace('\r\n', '\n')
|
||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||
self.processed_html = re.sub('\x14|\x15|\x1c|\x1d', '', self.processed_html)
|
||||
self.processed_html = re.sub('\x14|\x15|\x1c|\x1d|\xef|\x12|\x13|\xec', '', self.processed_html)
|
||||
|
||||
def ensure_unit(self, raw, unit='px'):
|
||||
if re.search(r'\d+$', raw) is not None:
|
||||
|
@ -1556,7 +1556,8 @@ class MobiWriter(object):
|
||||
else:
|
||||
raise NotImplementedError("missing date or timestamp needed for mobi_periodical")
|
||||
|
||||
if oeb.metadata.cover:
|
||||
if oeb.metadata.cover and \
|
||||
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids:
|
||||
id = unicode(oeb.metadata.cover[0])
|
||||
item = oeb.manifest.ids[id]
|
||||
href = item.href
|
||||
@ -2028,7 +2029,7 @@ class MobiWriter(object):
|
||||
indices.write(pack('>H', pos)) # Save the offset for IDXTIndices
|
||||
name = "%04X"%count
|
||||
indxt.write(chr(len(name)) + name) # Write the name
|
||||
indxt.write(INDXT['periodical']) # entryType [0x0F | 0xDF | 0xFF | 0x3F]
|
||||
indxt.write(INDXT['periodical']) # entryType [0x0F | 0xDF | 0xFF | 0x3F]
|
||||
indxt.write(chr(1)) # subType 1
|
||||
indxt.write(decint(offset, DECINT_FORWARD)) # offset
|
||||
indxt.write(decint(length, DECINT_FORWARD)) # length
|
||||
|
@ -759,6 +759,15 @@ class Manifest(object):
|
||||
return u'Item(id=%r, href=%r, media_type=%r)' \
|
||||
% (self.id, self.href, self.media_type)
|
||||
|
||||
def _parse_xml(self, data):
    '''
    Parse `data` as XML and return the root element.

    If parsing fails with an entity related error (error code 26, presumably
    the parser's undeclared-entity code, or a message starting with
    "Entity"), the data is converted with ``xml_to_unicode`` with entity
    resolution turned on and parsed a second time.

    :param data: The raw XML.
    :return: The parsed root element.
    :raises etree.XMLSyntaxError: If the data cannot be parsed.
    '''
    try:
        return etree.fromstring(data)
    except etree.XMLSyntaxError as err:
        entity_error = getattr(err, 'code', 0) == 26 or \
                str(err).startswith('Entity')
        if not entity_error:
            # Not an entity problem: propagate the error rather than handing
            # the caller None in place of a parsed tree.
            raise
        data = xml_to_unicode(data, strip_encoding_pats=True,
                resolve_entities=True)[0]
        return etree.fromstring(data)
|
||||
|
||||
def _parse_xhtml(self, data):
|
||||
self.oeb.log.debug('Parsing', self.href, '...')
|
||||
# Convert to Unicode and normalize line endings
|
||||
@ -952,7 +961,7 @@ class Manifest(object):
|
||||
elif self.media_type.lower() in OEB_DOCS:
|
||||
data = self._parse_xhtml(data)
|
||||
elif self.media_type.lower()[-4:] in ('+xml', '/xml'):
|
||||
data = etree.fromstring(data)
|
||||
data = self._parse_xml(data)
|
||||
elif self.media_type.lower() in OEB_STYLES:
|
||||
data = self._parse_css(data)
|
||||
elif 'text' in self.media_type.lower():
|
||||
|
@ -27,7 +27,6 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
|
||||
OEBError, OEBBook, DirContainer
|
||||
from calibre.ebooks.oeb.writer import OEBWriter
|
||||
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
|
||||
from calibre.ebooks.metadata.epub import CoverRenderer
|
||||
from calibre.startup import get_lang
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
@ -343,8 +342,11 @@ class OEBReader(object):
|
||||
|
||||
descriptionElement = xpath(child,
|
||||
'descendant::calibre:meta[@name = "description"]')
|
||||
if descriptionElement :
|
||||
description = descriptionElement[0].text
|
||||
if descriptionElement:
|
||||
description = etree.tostring(descriptionElement[0],
|
||||
method='text', encoding=unicode).strip()
|
||||
if not description:
|
||||
description = None
|
||||
else :
|
||||
description = None
|
||||
|
||||
@ -524,12 +526,14 @@ class OEBReader(object):
|
||||
return
|
||||
|
||||
def _cover_from_html(self, hcover):
|
||||
from calibre.ebooks import render_html_svg_workaround
|
||||
with TemporaryDirectory('_html_cover') as tdir:
|
||||
writer = OEBWriter()
|
||||
writer(self.oeb, tdir)
|
||||
path = os.path.join(tdir, urlunquote(hcover.href))
|
||||
renderer = CoverRenderer(path)
|
||||
data = renderer.image_data
|
||||
data = render_html_svg_workaround(path, self.logger)
|
||||
if not data:
|
||||
data = ''
|
||||
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
|
||||
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
|
||||
return item
|
||||
|
@ -97,6 +97,8 @@ class MergeMetadata(object):
|
||||
id = old_cover = None
|
||||
if 'cover' in self.oeb.guide:
|
||||
old_cover = self.oeb.guide['cover']
|
||||
if prefer_metadata_cover and old_cover is not None:
|
||||
cdata = ''
|
||||
if cdata:
|
||||
self.oeb.guide.remove('cover')
|
||||
self.oeb.guide.remove('titlepage')
|
||||
@ -106,6 +108,10 @@ class MergeMetadata(object):
|
||||
if not cdata:
|
||||
return item.id
|
||||
self.oeb.manifest.remove(item)
|
||||
elif not cdata:
|
||||
id = self.oeb.manifest.generate(id='cover')
|
||||
self.oeb.manifest.add(id, old_cover.href, 'image/jpeg')
|
||||
return id
|
||||
if cdata:
|
||||
id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
|
||||
self.oeb.manifest.add(id, href, 'image/jpeg', data=cdata)
|
||||
|
@ -301,30 +301,26 @@ class FlowSplitter(object):
|
||||
|
||||
# Tree 1
|
||||
hit_split_point = False
|
||||
for elem in list(body.iterdescendants(etree.Element)):
|
||||
for elem in list(body.iterdescendants()):
|
||||
if elem is split_point:
|
||||
hit_split_point = True
|
||||
if before:
|
||||
x = elem.get('id', None)
|
||||
nix_element(elem)
|
||||
|
||||
continue
|
||||
if hit_split_point:
|
||||
x = elem.get('id', None)
|
||||
nix_element(elem)
|
||||
|
||||
|
||||
# Tree 2
|
||||
hit_split_point = False
|
||||
for elem in list(body2.iterdescendants(etree.Element)):
|
||||
for elem in list(body2.iterdescendants()):
|
||||
if elem is split_point2:
|
||||
hit_split_point = True
|
||||
if not before:
|
||||
x = elem.get('id', None)
|
||||
nix_element(elem, top=False)
|
||||
continue
|
||||
if not hit_split_point:
|
||||
x = elem.get('id', None)
|
||||
nix_element(elem, top=False)
|
||||
body2.text = '\n'
|
||||
|
||||
|
@ -53,7 +53,7 @@
|
||||
<item row="2" column="0">
|
||||
<widget class="QLabel" name="label_8">
|
||||
<property name="text">
|
||||
<string>Author S&ort: </string>
|
||||
<string>Author s&ort: </string>
|
||||
</property>
|
||||
<property name="alignment">
|
||||
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
|
||||
@ -118,7 +118,7 @@
|
||||
<item row="5" column="0">
|
||||
<widget class="QLabel" name="label_4">
|
||||
<property name="text">
|
||||
<string>Add Ta&gs: </string>
|
||||
<string>Add ta&gs: </string>
|
||||
</property>
|
||||
<property name="alignment">
|
||||
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
|
||||
|
@ -392,7 +392,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
self.tags.update_tags_cache(self.db.all_tags())
|
||||
|
||||
def fetch_cover(self):
|
||||
isbn = unicode(self.isbn.text()).strip()
|
||||
isbn = re.sub(r'[^0-9a-zA-Z]', '', unicode(self.isbn.text())).strip()
|
||||
self.fetch_cover_button.setEnabled(False)
|
||||
self.setCursor(Qt.WaitCursor)
|
||||
title, author = map(unicode, (self.title.text(), self.authors.text()))
|
||||
@ -510,7 +510,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
aus = qstring_to_unicode(self.author_sort.text())
|
||||
if aus:
|
||||
self.db.set_author_sort(self.id, aus, notify=False)
|
||||
self.db.set_isbn(self.id, qstring_to_unicode(self.isbn.text()), notify=False)
|
||||
self.db.set_isbn(self.id,
|
||||
re.sub(r'[^0-9a-zA-Z]', '', unicode(self.isbn.text())), notify=False)
|
||||
self.db.set_rating(self.id, 2*self.rating.value(), notify=False)
|
||||
self.db.set_publisher(self.id, qstring_to_unicode(self.publisher.currentText()), notify=False)
|
||||
self.db.set_tags(self.id, qstring_to_unicode(self.tags.text()).split(','), notify=False)
|
||||
|
@ -1873,13 +1873,19 @@ def main(args=sys.argv):
|
||||
return run_gui(opts, args, actions, listener, app)
|
||||
else:
|
||||
return run_gui(opts, args, actions, listener, app)
|
||||
otherinstance = False
|
||||
try:
|
||||
listener = Listener(address=ADDRESS)
|
||||
except socket.error: # Good si is correct
|
||||
communicate(args)
|
||||
except socket.error: # Good si is correct (on UNIX)
|
||||
otherinstance = True
|
||||
else:
|
||||
# On windows only singleinstance can be trusted
|
||||
otherinstance = True if iswindows else False
|
||||
if not otherinstance:
|
||||
return run_gui(opts, args, actions, listener, app)
|
||||
|
||||
communicate(args)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
|
@ -20,8 +20,8 @@ What formats does |app| support conversion to/from?
|
||||
|app| supports the conversion of many input formats to many output formats.
|
||||
It can convert every input format in the following list to every output format.
|
||||
|
||||
*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, MOBI, ODT, PDF, PRC**, RTF, TXT
|
||||
*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PDF, TXT
|
||||
*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TXT
|
||||
*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TXT
|
||||
|
||||
** PRC is a generic format; |app| supports PRC files with TextRead and MOBIBook headers
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
'''
|
||||
Trac macro to generate an end-user changelog from the bzr logs.
|
||||
'''
|
||||
import re, collections, time
|
||||
import re, collections, time, os
|
||||
|
||||
from bzrlib import log as blog, branch
|
||||
|
||||
@ -12,48 +12,55 @@ from trac.wiki.macros import WikiMacroBase
|
||||
from trac.util import Markup
|
||||
|
||||
|
||||
BZR_PATH = '/var/bzr/code/calibre/trunk'
|
||||
BZR_PATH = '/usr/local/calibre'
|
||||
|
||||
class ChangelogFormatter(blog.LogFormatter):
    '''
    Log formatter that groups commit messages by release.

    Revisions are fed in through :meth:`log_revision`. A revision whose
    message starts with ``version X.Y.Z`` opens a new release entry; the
    messages of the revisions that follow it are collected under that entry
    until the next version revision is seen. :meth:`to_wiki_txt` renders the
    collected entries as wiki text.
    '''

    supports_tags = True
    supports_merge_revisions = False
    _show_advice = False

    def __init__(self, num_of_versions=20):
        # Maximum number of release entries to collect
        self.num_of_versions = num_of_versions
        # Messages gathered for the release currently being collected
        self.messages = collections.deque()
        # Completed releases as (heading, set of messages) tuples
        self.entries = []
        # Heading ("X.Y.Z (date)") of the release being collected, if any
        self.current_entry = None

    def log_revision(self, r):
        '''Record revision `r`, either as a release marker or as a message.'''
        if len(self.entries) > self.num_of_versions-1:
            return
        msg = r.rev.message
        match = re.match(r'version\s+(\d+\.\d+\.\d+)', msg)

        if match:
            # A new release starts: store the one collected so far
            if self.current_entry is not None:
                self.entries.append((self.current_entry, set(self.messages)))
            timestamp = r.rev.timezone + r.rev.timestamp
            self.current_entry = match.group(1) + time.strftime(' (%d %b, %Y)', time.gmtime(timestamp))
            self.messages = collections.deque()
        else:
            # Skip messages without letters or of five characters or fewer,
            # as well as translation updates and commits marked IGN
            if re.search(r'[a-zA-Z]', msg) and len(msg.strip()) > 5:
                if 'translation' not in msg and not msg.startswith('IGN'):
                    self.messages.append(msg.strip())

    def to_wiki_txt(self):
        '''Return the collected entries formatted as wiki text.'''
        txt = ['= Changelog =\n[[PageOutline]]']
        for entry in self.entries:
            txt.append(u'----\n== Version '+entry[0]+' ==')
            # The heading is "X.Y.Z (date)", so compare on the version prefix;
            # an exact comparison against '0.6.0' can never succeed.
            if entry[0].startswith('0.6.0 '):
                txt.append(u'For a list of new features in 0.6.0 see http://calibre.kovidgoyal.net/new_in_6')
            else:
                for msg in entry[1]:
                    txt.append(u' * ' + msg)

        return u'\n'.join(txt)
|
||||
|
||||
|
||||
def bzr_log_to_txt():
    '''
    Return the changelog of the calibre branch as wiki text.

    The branch is opened at ``BZR_PATH``, or at ``/home/kovid/work/calibre``
    if that path does not exist. Its log is run through
    ``ChangelogFormatter`` and the formatted result is returned.
    '''
    path = BZR_PATH
    if not os.path.exists(path):
        path = '/home/kovid/work/calibre'
    # Open the branch only after the path has been resolved; opening
    # BZR_PATH unconditionally first would defeat the fallback above.
    b = branch.Branch.open(path)
    lf = ChangelogFormatter()
    blog.show_log(b, lf)
    return lf.to_wiki_txt()
|
||||
@ -68,6 +75,6 @@ class ChangeLogMacro(WikiMacroBase):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print bzr_log_to_txt()
|
||||
|
||||
|
||||
print bzr_log_to_txt().encode('utf-8')
|
||||
|
||||
|
||||
|
@ -4,9 +4,9 @@
|
||||
#
|
||||
msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: calibre 0.6.0b14\n"
|
||||
"POT-Creation-Date: 2009-07-19 12:31+MDT\n"
|
||||
"PO-Revision-Date: 2009-07-19 12:31+MDT\n"
|
||||
"Project-Id-Version: calibre 0.6.0b16\n"
|
||||
"POT-Creation-Date: 2009-07-22 07:39+MDT\n"
|
||||
"PO-Revision-Date: 2009-07-22 07:39+MDT\n"
|
||||
"Last-Translator: Automatically generated\n"
|
||||
"Language-Team: LANGUAGE\n"
|
||||
"MIME-Version: 1.0\n"
|
||||
@ -69,8 +69,8 @@ msgstr ""
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/reader.py:136
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/reader.py:138
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/jacket.py:84
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:101
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:102
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:103
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:104
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/input.py:26
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/palmdoc/writer.py:29
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ztxt/writer.py:27
|
||||
@ -107,7 +107,7 @@ msgstr ""
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/comicconf.py:48
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/fetch_metadata.py:106
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/fetch_metadata.py:139
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:345
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:348
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:34
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:39
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:40
|
||||
@ -126,8 +126,8 @@ msgstr ""
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1430
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1514
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1599
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1622
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1673
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1621
|
||||
#: /home/kovid/work/calibre/src/calibre/library/database2.py:1672
|
||||
#: /home/kovid/work/calibre/src/calibre/library/server.py:294
|
||||
#: /home/kovid/work/calibre/src/calibre/library/server.py:355
|
||||
#: /home/kovid/work/calibre/src/calibre/utils/podofo/__init__.py:45
|
||||
@ -1384,6 +1384,11 @@ msgid ""
|
||||
"Fetch a cover image for the book identified by ISBN from LibraryThing.com\n"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:1053
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
|
||||
msgid "Cover"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:22
|
||||
msgid "Modify images to meet Palm device size limitations."
|
||||
msgstr ""
|
||||
@ -1405,14 +1410,10 @@ msgstr ""
|
||||
msgid "Disable compression of the file contents."
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:101
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:103
|
||||
msgid "All articles"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260
|
||||
msgid "Cover"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1261
|
||||
msgid "Title Page"
|
||||
msgstr ""
|
||||
@ -3662,50 +3663,50 @@ msgstr ""
|
||||
msgid "The cover in the %s format is invalid"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:402
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:405
|
||||
msgid "Downloading cover..."
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:414
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:419
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:425
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:417
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:422
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:428
|
||||
msgid "Cannot fetch cover"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:415
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:426
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:418
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:429
|
||||
msgid "<b>Could not fetch cover.</b><br/>"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:416
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:419
|
||||
msgid "The download timed out."
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:420
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:423
|
||||
msgid "Could not find cover for this book. Try specifying the ISBN first."
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:432
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:435
|
||||
msgid "Bad cover"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:433
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:436
|
||||
msgid "The cover is not a valid picture"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:472
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:475
|
||||
msgid "Cannot fetch metadata"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:473
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:476
|
||||
msgid "You must specify at least one of ISBN, Title, Authors or Publisher"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:499
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:502
|
||||
msgid "Permission denied"
|
||||
msgstr ""
|
||||
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:500
|
||||
#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:503
|
||||
msgid "Could not open %s. Is it being used by another program?"
|
||||
msgstr ""
|
||||
|
||||
|
@ -102,3 +102,5 @@ class Log(object):
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
self.prints(INFO, *args, **kwargs)
|
||||
|
||||
default_log = Log()
|
||||
|
@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in (
|
||||
'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
|
||||
'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
|
||||
'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
|
||||
'eltiempo_hn',
|
||||
'eltiempo_hn', 'slate',
|
||||
)]
|
||||
|
||||
|
||||
|
@ -12,26 +12,27 @@ class AlJazeera(BasicNewsRecipe):
|
||||
title = 'Al Jazeera in English'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'News from Middle East'
|
||||
language = _('English')
|
||||
publisher = 'Al Jazeera'
|
||||
category = 'news, politics, middle east'
|
||||
simultaneous_downloads = 1
|
||||
delay = 4
|
||||
delay = 4
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'iso-8859-1'
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
, '--ignore-tables'
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True'
|
||||
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})]
|
||||
|
||||
remove_tags = [
|
||||
|
@ -12,9 +12,10 @@ class Azstarnet(BasicNewsRecipe):
|
||||
title = 'Arizona Daily Star'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'news from Arizona'
|
||||
language = _('English')
|
||||
publisher = 'azstarnet.com'
|
||||
category = 'news, politics, Arizona, USA'
|
||||
delay = 1
|
||||
delay = 1
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
@ -28,8 +29,8 @@ class Azstarnet(BasicNewsRecipe):
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
@ -40,8 +41,8 @@ class Azstarnet(BasicNewsRecipe):
|
||||
br['pass' ] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
|
||||
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})]
|
||||
|
||||
remove_tags = [
|
||||
@ -49,15 +50,15 @@ class Azstarnet(BasicNewsRecipe):
|
||||
,dict(name='div',attrs={'class':'bannerinstory'})
|
||||
]
|
||||
|
||||
|
||||
|
||||
feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
soup.html['dir' ] = 'ltr'
|
||||
soup.html['lang'] = 'en-US'
|
||||
mtag = '\n<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
|
||||
soup.head.insert(0,mtag)
|
||||
soup.head.insert(0,mtag)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
|
||||
|
@ -14,6 +14,7 @@ class CodingHorror(BasicNewsRecipe):
|
||||
description = 'programming and human factors - Jeff Atwood'
|
||||
category = 'blog, programming'
|
||||
publisher = 'Jeff Atwood'
|
||||
language = _('English')
|
||||
author = 'Jeff Atwood'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
|
@ -16,6 +16,7 @@ class Sueddeutsche(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
timefmt = ' [%a %d %b %Y]'
|
||||
max_articles_per_feed = 50
|
||||
language = _('English')
|
||||
no_stylesheets = True
|
||||
html2epub_options = 'linearize_tables = True\nbase_font_size2=14'
|
||||
html2lrf_options = ['--ignore-tables']
|
||||
|
@ -11,25 +11,26 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class MoneyNews(BasicNewsRecipe):
|
||||
title = 'Moneynews.com'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Financial news worldwide'
|
||||
description = 'Financial news worldwide'
|
||||
publisher = 'moneynews.com'
|
||||
category = 'news, finances, USA, business'
|
||||
language = _('English')
|
||||
category = 'news, finances, USA, business'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'cp1252'
|
||||
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
, '--ignore-tables'
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
|
||||
|
||||
feeds = [
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
|
||||
|
||||
feeds = [
|
||||
(u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' )
|
||||
,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
|
||||
,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' )
|
||||
@ -38,12 +39,12 @@ class MoneyNews(BasicNewsRecipe):
|
||||
,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' )
|
||||
]
|
||||
|
||||
|
||||
|
||||
keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='td' , attrs={'id':'article_fontsize'})
|
||||
,dict(name='table', attrs={'id':'toolbox' })
|
||||
,dict(name='tr' , attrs={'id':'noprint3' })
|
||||
]
|
||||
|
||||
|
||||
|
@ -17,7 +17,7 @@ class Publico(BasicNewsRecipe):
|
||||
max_articles_per_feed = 30
|
||||
encoding='utf-8'
|
||||
no_stylesheets = True
|
||||
language = _('Portuguese')
|
||||
language = _('Portugese')
|
||||
preprocess_regexps = [(re.compile(u"\uFFFD", re.DOTALL|re.IGNORECASE), lambda match: ''),]
|
||||
|
||||
feeds = [
|
||||
|
@ -15,6 +15,7 @@ class ScottHanselman(BasicNewsRecipe):
|
||||
category = "Scott, Computer, Zen, .NET, C#, Hanselman, Scott, Weblog, Diabetes, Portland, Zimbabwe, ComputerZen.com - Scott Hanselman's Musings"
|
||||
publisher = 'Scott Hanselman'
|
||||
author = 'Scott Hanselman'
|
||||
language = _('English')
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
|
330
src/calibre/web/feeds/recipes/recipe_slate.py
Normal file
330
src/calibre/web/feeds/recipes/recipe_slate.py
Normal file
@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Fetches the last 7 days of featured articles from slate.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
|
||||
|
||||
class Slate(BasicNewsRecipe):
    """Recipe for slate.com.

    The dated sections of the Slate front page are scraped for article
    links, articles matching a few exclusion keywords are dropped, and
    everything that remains is delivered as one 'All Articles' feed.
    """

    # Method variables for customizing downloads
    title = 'Slate'
    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker@hotmail.com'
    language = _('English')
    max_articles_per_feed = 40
    oldest_article = 7.0
    recursions = 0
    delay = 0
    simultaneous_downloads = 5
    timeout = 120.0
    timefmt = ''
    feeds = None
    no_stylesheets = True
    encoding = None

    # Method variables for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Method variables for pre/post processing of HTML
    remove_tags = [ dict(name=['link','style']),
                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
                             'fray_article_discussion','bizbox_sponsored_links_bottom',
                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
                             'article_top_wedge','content-top','page-title',
                             'block-today039s-business-press-archives','block-blog-roll',
                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
                             'service-links-bottom','comments','ft']),
                    dict(attrs={'class':['fray_article_links','clearing','nav',
                                         'service-links service-links-stack','yui-b last',
                                         'read-more-comments']})]

    # Implicit concatenation (rather than backslash continuations) keeps the
    # indentation of this class body out of the stylesheet text.
    extra_css = (
        '.headline {text-align:left;}\n'
        '.byline {font:monospace; text-align:left; margin-bottom:0pt;}\n'
        '.dateline {text-align:left; height:0pt;}\n'
        '.source {align:left;}\n'
        '.credit {text-align:right;font-size:smaller;}\n'
    )

    baseURL = 'http://slate.com'
    # Section date labels collected by extract_sections(); rebound per run there.
    section_dates = []

    def tag_to_strings(self, tag):
        """Split the direct children of *tag* into a list of strings.

        Returns '' when *tag* is falsy and *tag* itself when it is already a
        string.  Otherwise returns a list with one entry per direct child:
        text and CDATA nodes are taken as they are, child tags are rendered
        with ``tag_to_string`` and skipped when that yields nothing.
        """
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        pieces = []
        for child in tag.contents:
            if isinstance(child, (NavigableString, CData)):
                pieces.append(child.string)
            elif isinstance(child, Tag):
                text = self.tag_to_string(child)
                if text:
                    pieces.append(text)
        return pieces

    def extract_sections(self):
        """Fetch the front page and return its section ``<ul>`` elements.

        As a side effect ``self.section_dates`` is rebuilt with the text of
        the 'todaydateline' element followed by every 'maindateline' element
        found inside the 'toc_links_container' element.
        """
        soup = self.index_to_soup(self.baseURL)

        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})

        # Start from a fresh per-instance list so repeated runs (or several
        # recipe instances) do not keep appending to the shared class list.
        self.section_dates = []

        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        # NOTE(review): today's date is deliberately recorded twice, as in the
        # original code.  Presumably the headline <ul> nested into the first
        # section below shows up as an extra <ul> once the markup is
        # re-parsed, and this keeps the indexes aligned - confirm before
        # changing.
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))

        for older_section in soup.findAll(True, attrs={'class':'maindateline'}):
            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))

        headline_stories = soup_top_stories.find('ul')
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        section_lists[0].insert(0,headline_stories)

        return list(section_lists)

    def extract_section_articles(self, sections_html):
        """Build ``[(section_date, [article_dict, ...]), ...]`` from the sections.

        *sections_html* is stringified and re-parsed; every ``<ul>`` carrying
        an ``id`` attribute becomes a section keyed by the entry of
        ``self.section_dates`` at the same index.  Articles are skipped when
        their description, full title or author matches an exclusion keyword
        or when their URL is already present in the section.  The result is
        passed through ``remove_duplicates`` before being returned.
        """
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')
        articles = {}
        key = None
        ans = []

        # Compiled once here instead of once per article.
        excluded_description = re.compile('|'.join(['Slate V','Twitter feed','podcast']))
        excluded_title = re.compile('|'.join(['Gabfest','Slate V']))
        excluded_author = re.compile('|'.join(['Prudence']))
        papers_summary = "A summary of what's in the major U.S. newspapers."

        for (i,section) in enumerate(sections):

            # Get the section name; lists without an id are not sections.
            # (Tag.has_key tests attributes, `in` would test child nodes.)
            if not section.has_key('id'):
                continue
            key = self.section_dates[i]
            articles[key] = []
            ans.append(key)

            # Extract the article attributes
            for article in section.findAll('li'):
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article)

                author = None
                description = None
                pubdate = None

                # find() > 0 (not >= 0) is kept from the original: a match at
                # the very start of the text does not count.
                if len(bylines) == 2 and full_title.find("Today's Papers") > 0:
                    description = papers_summary

                if len(bylines) == 3:
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]','', author)
                    author = re.sub(',','', author)
                    if bylines[1] is not None:
                        description = bylines[1]
                        if full_title.find('major U.S. newspapers') > 0:
                            description = papers_summary

                # NOTE(review): author is only assigned when there are exactly
                # three strings, so this branch cannot run at present.  Its
                # output format (trailing separator included) is preserved.
                if len(bylines) > 3 and author is not None:
                    extra_authors = [substring.strip() for substring in bylines[3:]]
                    author += " | " + " | ".join(extra_authors) + " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None and excluded_description.search(description):
                    continue

                # Skip articles whose title contain excluded keywords
                if full_title is not None and excluded_title.search(full_title):
                    continue

                # Skip articles whose author contain excluded keywords
                if author is not None and excluded_author.search(author):
                    continue

                # Check to make sure we're not adding a duplicate
                if any(existing['url'] == url for existing in articles[key]):
                    continue

                # Build the dictionary entry for this article
                articles[key].append(dict(title=title, url=url, date=pubdate, description=description,
                                          author=author, content=''))

            # Promote 'newspapers' to top.  Indexed through `key` so a section
            # that produced no articles cannot hit an unbound or stale name.
            section_articles = articles[key]
            for (idx,entry) in enumerate(section_articles):
                if entry['description'] is not None:
                    if entry['description'].find('newspapers') > 0:
                        section_articles.insert(0,section_articles.pop(idx))

        ans = [(key, articles[key]) for key in ans if key in articles]
        ans = self.remove_duplicates(ans)
        return ans

    def flatten_document(self, ans):
        """Merge every section of *ans* into one 'All Articles' section.

        *ans* is a sequence of ``(name, articles)`` pairs; the return value is
        ``[['All Articles', all_articles]]`` with the articles in their
        original order.
        """
        flat_articles = [article for section in ans for article in section[1]]
        return [['All Articles', flat_articles]]

    def remove_duplicates(self, ans):
        """Drop articles whose URL already appeared in an earlier section.

        The article lists inside *ans* are edited in place and *ans* itself
        is returned.  Repeated URLs inside a single section are left alone.
        """
        seen_urls = set()
        handled_lists = set()
        for section in ans:
            section_articles = section[1]
            # Two entries may share one list object (same section key twice);
            # filtering it against its own URLs would empty it.
            if id(section_articles) in handled_lists:
                continue
            handled_lists.add(id(section_articles))
            # Rebuild instead of deleting while iterating, which skips the
            # element that follows each deletion.
            section_articles[:] = [entry for entry in section_articles
                                   if entry['url'] not in seen_urls]
            seen_urls.update(entry['url'] for entry in section_articles)
        return ans

    def print_version(self, url):
        """Return *url* with 'pagenum/all/' appended."""
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self):
        """Return the feed list: one 'All Articles' section built from the front page."""
        sections = self.extract_sections()
        section_list = self.extract_section_articles(sections)
        return self.flatten_document(section_list)

    def postprocess_html(self, soup, first_fetch):
        """Normalise kicker, headline, byline, dateline and captions in *soup*.

        *first_fetch* is accepted for interface compatibility and not used.
        Returns the modified *soup*.
        """
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
        if dept_kicker is not None:
            kicker_strings = self.tag_to_strings(dept_kicker)
            # The slice tolerates kickers with fewer than four strings.
            kicker = ''.join(kicker_strings[2:4])
            # Strip literal periods.  The former re.sub('.', ...) treated '.'
            # as "any character" and erased the whole kicker.
            kicker = kicker.replace('.', '')
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            h3Tag.insert(0, emTag)
            emTag.insert(0,kicker)
            dept_kicker.replaceWith(h3Tag)

        # Change <h1> to <h2>
        headline = soup.find("h1")
        if headline is not None:
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            strs = self.tag_to_strings(headline)
            h2tag.insert(0, '<br />'.join(strs))
            headline.replaceWith(h2tag)

        # Fix up the concatenated byline and dateline
        byline = soup.find(True,attrs={'class':'byline'})
        if byline is not None:
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None:
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}):
            emTag = Tag(soup, "em")
            emTag.insert(0, '<br />' + self.tag_to_string(caption))
            hrTag = Tag(soup, 'hr')
            emTag.insert(1, hrTag)
            caption.replaceWith(emTag)

        return soup

    def postprocess_book(self, oeb, opts, log):
        """Fill in missing author and description on the table-of-contents entries.

        Handles a TOC of depth 2 (articles directly under the root) and of
        depth 3 (articles grouped in sections); other depths are left alone.
        """

        def extract_byline(href):
            # Text of the first element with class 'byline', or None.
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True,attrs={'class':'byline'})
            if byline is None:
                return None
            return self.tag_to_string(byline,use_alt=False)

        def extract_description(href):
            # First 200 characters of the first paragraph that is not a
            # 'By ...' / 'Posted ...' line, followed by '...'; None if there
            # is no such paragraph.
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            for p in soup.findAll('p'):
                if self.tag_to_string(p,use_alt=False).startswith(('By ', 'Posted ')):
                    continue

                # Image wrappers are removed before the text is extracted.
                for image in p.findAll(True, attrs={'class':'imagewrapper'}):
                    image.extract()
                return self.tag_to_string(p,use_alt=False)[:200] + '...'

            return None

        def fill_missing(article):
            # Only fields that are still None are touched.
            if article.author is None:
                article.author = extract_byline(article.href)
            if article.description is None:
                article.description = extract_description(article.href)

        depth = oeb.toc.depth()
        if depth == 2:
            for article in oeb.toc:
                fill_missing(article)
        elif depth == 3:
            for section in oeb.toc:
                for article in section:
                    fill_missing(article)
|
||||
|
||||
|
||||
|
@ -15,6 +15,7 @@ class StackOverflowBlog(BasicNewsRecipe):
|
||||
category = 'blog, programming'
|
||||
publisher = 'StackOverflow team'
|
||||
oldest_article = 30
|
||||
language = _('English')
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = True
|
||||
|
@ -2,36 +2,25 @@
|
||||
__license__ = 'GPL v3'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Wired(BasicNewsRecipe):
|
||||
|
||||
|
||||
title = 'Wired.com'
|
||||
__author__ = 'David Chen <SonyReader<at>DaveChen<dot>org>'
|
||||
__author__ = 'Kovid Goyal'
|
||||
description = 'Technology news'
|
||||
timefmt = ' [%Y%b%d %H%M]'
|
||||
language = _('English')
|
||||
no_stylesheets = True
|
||||
#html2lrf_options = ['--base-font-size', '16']
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
|
||||
[
|
||||
|
||||
## Remove any banners/links/ads/cruft before the body of the article.
|
||||
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
|
||||
|
||||
## Remove any links/ads/comments/cruft from the end of the body of the article.
|
||||
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
|
||||
|
||||
## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
|
||||
(r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1),),
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
remove_tags_before = dict(name='div', id='content')
|
||||
remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
|
||||
'footer', 'advertisement', 'blog_subscription_unit',
|
||||
'brightcove_component']),
|
||||
{'class':'entryActions'},
|
||||
dict(name=['noscript', 'script'])]
|
||||
|
||||
feeds = [
|
||||
('Top News', 'http://feeds.wired.com/wired/index'),
|
||||
('Culture', 'http://feeds.wired.com/wired/culture'),
|
||||
@ -47,8 +36,8 @@ class Wired(BasicNewsRecipe):
|
||||
('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
|
||||
('Commentary', 'http://feeds.wired.com/wired/commentary'),
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
    """Return *url* with the wired.com site prefix swapped for its /print/ form.

    URLs that do not contain 'http://www.wired.com/' come back unchanged.
    """
    article_prefix = 'http://www.wired.com/'
    print_prefix = 'http://www.wired.com/print/'
    return url.replace(article_prefix, print_prefix)
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user