diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index a9cf7c7045..3346f205b8 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -361,6 +361,8 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'): return '&'+ent+';' if ent == 'apos': return "'" + if ent == 'hellips': + ent = 'hellip' if ent.startswith(u'#x'): num = int(ent[2:], 16) if encoding is None or num > 255: @@ -382,6 +384,15 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'): except KeyError: return '&'+ent+';' +_ent_pat = re.compile(r'&(\S+);') + +def prepare_string_for_xml(raw, attribute=False): + raw = _ent_pat.sub(entity_to_unicode, raw) + raw = raw.replace('&', '&').replace('<', '<').replace('>', '>') + if attribute: + raw = raw.replace('"', '"').replace("'", ''') + return raw + if isosx: fdir = os.path.expanduser('~/.fonts') try: diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 3a6da541cb..7e38f3c47f 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' __appname__ = 'calibre' -__version__ = '0.6.0b16' +__version__ = '0.6.0b17' __author__ = "Kovid Goyal " import re diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index dcd5604aa8..4034a8810b 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -57,6 +57,35 @@ class HTMLRenderer(object): self.loop.exit(0) +def extract_cover_from_embedded_svg(html, base, log): + from lxml import etree + from calibre.ebooks.oeb.base import XPath, SVG, XLINK + root = etree.fromstring(html) + + svg = XPath('//svg:svg')(root) + if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'): + image = svg[0][0] + href = image.get(XLINK('href'), None) + path = os.path.join(base, *href.split('/')) + if href and os.access(path, os.R_OK): + return open(path, 'rb').read() + +def render_html_svg_workaround(path_to_html, log, width=590, height=750): + from calibre.ebooks.oeb.base import SVG_NS + raw = open(path_to_html, 'rb').read() + data = None + if SVG_NS in raw: + try: + data = extract_cover_from_embedded_svg(raw, + os.path.dirname(path_to_html), log) + except: + pass + if data is None: + renderer = render_html(path_to_html, width, height) + data = getattr(renderer, 'data', None) + return data + + def render_html(path_to_html, width=590, height=750): from PyQt4.QtWebKit import QWebPage from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index dc2aa230d5..3f8b563d96 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -54,7 +54,7 @@ class EPUBInput(InputFormatPlugin): return False @classmethod - def rationalize_cover(self, opf): + def rationalize_cover(self, opf, log): guide_cover, guide_elem = None, None for guide_elem in opf.iterguide(): if guide_elem.get('type', '').lower() == 'cover': @@ -65,28 +65,37 @@ class EPUBInput(InputFormatPlugin): spine = list(opf.iterspine()) if not spine: return + # Check if the cover specified in the guide is also + # the first element in spine idref = spine[0].get('idref', '') manifest = list(opf.itermanifest()) if not manifest: return - if manifest[0].get('id', False) != idref: + elem = [x for x in manifest if x.get('id', '') == idref] + if not elem or elem[0].get('href', None) != guide_cover: return + log('Found HTML cover', guide_cover) + + # Remove from spine as covers must be treated + # specially spine[0].getparent().remove(spine[0]) guide_elem.set('href', 'calibre_raster_cover.jpg') + from calibre.ebooks.oeb.base import OPF + t = etree.SubElement(elem[0].getparent(), OPF('item'), + href=guide_elem.get('href'), id='calibre_raster_cover') + t.set('media-type', 'image/jpeg') for elem in list(opf.iterguide()): if elem.get('type', '').lower() == 'titlepage': elem.getparent().remove(elem) - from calibre.ebooks.oeb.base import OPF t = etree.SubElement(guide_elem.getparent(), OPF('reference')) t.set('type', 'titlepage') t.set('href', guide_cover) t.set('title', 'Title Page') - from calibre.ebooks import render_html - renderer = render_html(guide_cover) + from calibre.ebooks import render_html_svg_workaround + renderer = render_html_svg_workaround(guide_cover, log) if renderer is not None: open('calibre_raster_cover.jpg', 'wb').write( - renderer.data) - + renderer) def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile @@ -121,7 +130,7 @@ class EPUBInput(InputFormatPlugin): for elem in opf.iterguide(): elem.set('href', delta+elem.get('href')) - self.rationalize_cover(opf) + self.rationalize_cover(opf, log) with open('content.opf', 'wb') as nopf: nopf.write(opf.render()) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index be096eece3..6e09c7e6d9 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -12,7 +12,7 @@ from urllib import unquote from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.constants import __appname__, __version__ -from calibre import strftime, guess_type +from calibre import strftime, guess_type, prepare_string_for_xml from calibre.customize.conversion import OptionRecommendation from lxml import etree @@ -210,6 +210,7 @@ class EPUBOutput(OutputFormatPlugin): id, href = self.oeb.manifest.generate('calibre-logo', 'calibre-logo.png') self.oeb.manifest.add(id, href, 'image/png', data=img_data) + title, author = map(prepare_string_for_xml, (title, author)) html = self.TITLEPAGE%dict(title=title, author=author, date=strftime('%d %b, %Y'), app=__appname__ +' '+__version__, diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index 040fee78a4..9fd8bf44e9 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -5,14 +5,10 @@ __copyright__ = '2008, Kovid Goyal ' '''Read meta information from epub files''' -import os, time +import os from cStringIO import StringIO from contextlib import closing -from PyQt4.Qt import QUrl, QEventLoop, QSize, QByteArray, QBuffer, \ - SIGNAL, QPainter, QImage, QObject, QApplication, Qt, QPalette -from PyQt4.QtWebKit import QWebPage - from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup from calibre.ebooks.metadata import MetaInformation @@ -102,64 +98,9 @@ class OCFDirReader(OCFReader): def open(self, path, *args, **kwargs): return open(os.path.join(self.root, path), *args, **kwargs) -class CoverRenderer(QObject): - WIDTH = 600 - HEIGHT = 800 - - def __init__(self, path): - if QApplication.instance() is None: - QApplication([]) - QObject.__init__(self) - self.loop = QEventLoop() - self.page = QWebPage() - pal = self.page.palette() - pal.setBrush(QPalette.Background, Qt.white) - self.page.setPalette(pal) - self.page.setViewportSize(QSize(self.WIDTH, self.HEIGHT)) - self.page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) - self.page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) - QObject.connect(self.page, SIGNAL('loadFinished(bool)'), self.render_html) - self._image_data = None - self.rendered = False - url = QUrl.fromLocalFile(os.path.normpath(path)) - self.page.mainFrame().load(url) - - def render_html(self, ok): - try: - if not ok: - self.rendered = True - return - image = QImage(self.page.viewportSize(), QImage.Format_ARGB32) - image.setDotsPerMeterX(96*(100/2.54)) - image.setDotsPerMeterY(96*(100/2.54)) - painter = QPainter(image) - self.page.mainFrame().render(painter) - painter.end() - ba = QByteArray() - buf = QBuffer(ba) - buf.open(QBuffer.WriteOnly) - image.save(buf, 'JPEG') - self._image_data = str(ba.data()) - finally: - self.loop.exit(0) - self.rendered = True - - def image_data(): - def fget(self): - if not self.rendered: - self.loop.exec_() - count = 0 - while count < 50 and not self.rendered: - time.sleep(0.1) - count += 1 - return self._image_data - return property(fget=fget) - image_data = image_data() - - def get_cover(opf, opf_path, stream): - from calibre.gui2 import is_ok_to_use_qt - if not is_ok_to_use_qt(): return None + from calibre.ebooks import render_html_svg_workaround + from calibre.utils.logging import default_log spine = list(opf.spine_items()) if not spine: return @@ -172,8 +113,7 @@ def get_cover(opf, opf_path, stream): cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage) if not os.path.exists(cpage): return - cr = CoverRenderer(cpage) - return cr.image_data + return render_html_svg_workaround(cpage, default_log) def get_metadata(stream, extract_cover=True): """ Return metadata as a :class:`MetaInformation` object """ diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index b5c1a14475..3d46668ee9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -443,7 +443,7 @@ class MobiReader(object): self.processed_html = '

' + self.processed_html.replace('\n\n', '

') + '' self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') - self.processed_html = re.sub('\x14|\x15|\x1c|\x1d', '', self.processed_html) + self.processed_html = re.sub('\x14|\x15|\x1c|\x1d|\xef|\x12|\x13|\xec', '', self.processed_html) def ensure_unit(self, raw, unit='px'): if re.search(r'\d+$', raw) is not None: diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index cdfe58df35..0a305dbe1c 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -1556,7 +1556,8 @@ class MobiWriter(object): else: raise NotImplementedError("missing date or timestamp needed for mobi_periodical") - if oeb.metadata.cover: + if oeb.metadata.cover and \ + unicode(oeb.metadata.cover[0]) in oeb.manifest.ids: id = unicode(oeb.metadata.cover[0]) item = oeb.manifest.ids[id] href = item.href @@ -2028,7 +2029,7 @@ class MobiWriter(object): indices.write(pack('>H', pos)) # Save the offset for IDXTIndices name = "%04X"%count indxt.write(chr(len(name)) + name) # Write the name - indxt.write(INDXT['periodical']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] + indxt.write(INDXT['periodical']) # entryType [0x0F | 0xDF | 0xFF | 0x3F] indxt.write(chr(1)) # subType 1 indxt.write(decint(offset, DECINT_FORWARD)) # offset indxt.write(decint(length, DECINT_FORWARD)) # length diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index d044be24b6..03d45a3dad 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -759,6 +759,15 @@ class Manifest(object): return u'Item(id=%r, href=%r, media_type=%r)' \ % (self.id, self.href, self.media_type) + def _parse_xml(self, data): + try: + return etree.fromstring(data) + except etree.XMLSyntaxError, err: + if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'): + data = xml_to_unicode(data, strip_encoding_pats=True, + resolve_entities=True)[0] + return etree.fromstring(data) + def _parse_xhtml(self, data): self.oeb.log.debug('Parsing', self.href, '...') # Convert to Unicode and normalize line endings @@ -952,7 +961,7 @@ class Manifest(object): elif self.media_type.lower() in OEB_DOCS: data = self._parse_xhtml(data) elif self.media_type.lower()[-4:] in ('+xml', '/xml'): - data = etree.fromstring(data) + data = self._parse_xml(data) elif self.media_type.lower() in OEB_STYLES: data = self._parse_css(data) elif 'text' in self.media_type.lower(): diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 6b246d0580..03c878b9d2 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -27,7 +27,6 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \ OEBError, OEBBook, DirContainer from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.metadata.epub import CoverRenderer from calibre.startup import get_lang from calibre.ptempfile import TemporaryDirectory from calibre.constants import __appname__, __version__ @@ -343,8 +342,11 @@ class OEBReader(object): descriptionElement = xpath(child, 'descendant::calibre:meta[@name = "description"]') - if descriptionElement : - description = descriptionElement[0].text + if descriptionElement: + description = etree.tostring(descriptionElement[0], + method='text', encoding=unicode).strip() + if not description: + description = None else : description = None @@ -524,12 +526,14 @@ class OEBReader(object): return def _cover_from_html(self, hcover): + from calibre.ebooks import render_html_svg_workaround with TemporaryDirectory('_html_cover') as tdir: writer = OEBWriter() writer(self.oeb, tdir) path = os.path.join(tdir, urlunquote(hcover.href)) - renderer = CoverRenderer(path) - data = renderer.image_data + data = render_html_svg_workaround(path, self.logger) + if not data: + data = '' id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) return item diff --git a/src/calibre/ebooks/oeb/transforms/metadata.py b/src/calibre/ebooks/oeb/transforms/metadata.py index 837769bc0d..96698b2db6 100644 --- a/src/calibre/ebooks/oeb/transforms/metadata.py +++ b/src/calibre/ebooks/oeb/transforms/metadata.py @@ -97,6 +97,8 @@ class MergeMetadata(object): id = old_cover = None if 'cover' in self.oeb.guide: old_cover = self.oeb.guide['cover'] + if prefer_metadata_cover and old_cover is not None: + cdata = '' if cdata: self.oeb.guide.remove('cover') self.oeb.guide.remove('titlepage') @@ -106,6 +108,10 @@ class MergeMetadata(object): if not cdata: return item.id self.oeb.manifest.remove(item) + elif not cdata: + id = self.oeb.manifest.generate(id='cover') + self.oeb.manifest.add(id, old_cover.href, 'image/jpeg') + return id if cdata: id, href = self.oeb.manifest.generate('cover', 'cover.jpg') self.oeb.manifest.add(id, href, 'image/jpeg', data=cdata) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index d4b60e3a59..d8d750eade 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -301,30 +301,26 @@ class FlowSplitter(object): # Tree 1 hit_split_point = False - for elem in list(body.iterdescendants(etree.Element)): + for elem in list(body.iterdescendants()): if elem is split_point: hit_split_point = True if before: - x = elem.get('id', None) nix_element(elem) continue if hit_split_point: - x = elem.get('id', None) nix_element(elem) # Tree 2 hit_split_point = False - for elem in list(body2.iterdescendants(etree.Element)): + for elem in list(body2.iterdescendants()): if elem is split_point2: hit_split_point = True if not before: - x = elem.get('id', None) nix_element(elem, top=False) continue if not hit_split_point: - x = elem.get('id', None) nix_element(elem, top=False) body2.text = '\n' diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui index 05cb9cf12c..f3cf8ab251 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.ui +++ b/src/calibre/gui2/dialogs/metadata_bulk.ui @@ -53,7 +53,7 @@ - Author S&ort: + Author s&ort: Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter @@ -118,7 +118,7 @@ - Add Ta&gs: + Add ta&gs: Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 59f1a5f2e3..10a81be9f5 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -392,7 +392,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.tags.update_tags_cache(self.db.all_tags()) def fetch_cover(self): - isbn = unicode(self.isbn.text()).strip() + isbn = re.sub(r'[^0-9a-zA-Z]', '', unicode(self.isbn.text())).strip() self.fetch_cover_button.setEnabled(False) self.setCursor(Qt.WaitCursor) title, author = map(unicode, (self.title.text(), self.authors.text())) @@ -510,7 +510,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): aus = qstring_to_unicode(self.author_sort.text()) if aus: self.db.set_author_sort(self.id, aus, notify=False) - self.db.set_isbn(self.id, qstring_to_unicode(self.isbn.text()), notify=False) + self.db.set_isbn(self.id, + re.sub(r'[^0-9a-zA-Z]', '', unicode(self.isbn.text())), notify=False) self.db.set_rating(self.id, 2*self.rating.value(), notify=False) self.db.set_publisher(self.id, qstring_to_unicode(self.publisher.currentText()), notify=False) self.db.set_tags(self.id, qstring_to_unicode(self.tags.text()).split(','), notify=False) diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index 15e12d315e..b4b8494c64 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -1873,13 +1873,19 @@ def main(args=sys.argv): return run_gui(opts, args, actions, listener, app) else: return run_gui(opts, args, actions, listener, app) + otherinstance = False try: listener = Listener(address=ADDRESS) - except socket.error: # Good si is correct - communicate(args) + except socket.error: # Good si is correct (on UNIX) + otherinstance = True else: + # On windows only singleinstance can be trusted + otherinstance = True if iswindows else False + if not otherinstance: return run_gui(opts, args, actions, listener, app) + communicate(args) + return 0 diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 30457b5deb..151ad1f73b 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -20,8 +20,8 @@ What formats does |app| support conversion to/from? |app| supports the conversion of many input formats to many output formats. It can convert every input format in the following list, to every output format. -*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, MOBI, ODT, PDF, PRC**, RTF, TXT -*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PDF, TXT +*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TXT +*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TXT ** PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers diff --git a/src/calibre/trac/plugins/Changelog.py b/src/calibre/trac/plugins/Changelog.py index 8d02ebdd73..774452c447 100644 --- a/src/calibre/trac/plugins/Changelog.py +++ b/src/calibre/trac/plugins/Changelog.py @@ -1,7 +1,7 @@ ''' Trac Macro to generate an end use Changelog from the svn logs. ''' -import re, collections, time +import re, collections, time, os from bzrlib import log as blog, branch @@ -12,48 +12,55 @@ from trac.wiki.macros import WikiMacroBase from trac.util import Markup -BZR_PATH = '/var/bzr/code/calibre/trunk' +BZR_PATH = '/usr/local/calibre' class ChangelogFormatter(blog.LogFormatter): - + supports_tags = True supports_merge_revisions = False - + _show_advice = False + def __init__(self, num_of_versions=20): self.num_of_versions = num_of_versions self.messages = collections.deque() self.entries = [] - self.current_entry = None - + self.current_entry = None + def log_revision(self, r): if len(self.entries) > self.num_of_versions-1: return msg = r.rev.message match = re.match(r'version\s+(\d+\.\d+.\d+)', msg) - + if match: if self.current_entry is not None: self.entries.append((self.current_entry, set(self.messages))) timestamp = r.rev.timezone + r.rev.timestamp self.current_entry = match.group(1) + time.strftime(' (%d %b, %Y)', time.gmtime(timestamp)) self.messages = collections.deque() - + else: if re.search(r'[a-zA-Z]', msg) and len(msg.strip()) > 5: if 'translation' not in msg and not msg.startswith('IGN'): self.messages.append(msg.strip()) - + def to_wiki_txt(self): txt = ['= Changelog =\n[[PageOutline]]'] for entry in self.entries: txt.append(u'----\n== Version '+entry[0]+' ==') - for msg in entry[1]: - txt.append(u' * ' + msg) - + if entry[0] == '0.6.0': + txt.append(u'For a list of new features in 0.6.0 see http://calibre.kovidgoyal.net/new_in_6') + else: + for msg in entry[1]: + txt.append(u' * ' + msg) + return u'\n'.join(txt) - + def bzr_log_to_txt(): - b = branch.Branch.open(BZR_PATH) + path = BZR_PATH + if not os.path.exists(path): + path = '/home/kovid/work/calibre' + b = branch.Branch.open(path) lf = ChangelogFormatter() blog.show_log(b, lf) return lf.to_wiki_txt() @@ -68,6 +75,6 @@ class ChangeLogMacro(WikiMacroBase): if __name__ == '__main__': - print bzr_log_to_txt() - - + print bzr_log_to_txt().encode('utf-8') + + diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot index 457d2db02c..c8b030f776 100644 --- a/src/calibre/translations/calibre.pot +++ b/src/calibre/translations/calibre.pot @@ -4,9 +4,9 @@ # msgid "" msgstr "" -"Project-Id-Version: calibre 0.6.0b14\n" -"POT-Creation-Date: 2009-07-19 12:31+MDT\n" -"PO-Revision-Date: 2009-07-19 12:31+MDT\n" +"Project-Id-Version: calibre 0.6.0b16\n" +"POT-Creation-Date: 2009-07-22 07:39+MDT\n" +"PO-Revision-Date: 2009-07-22 07:39+MDT\n" "Last-Translator: Automatically generated\n" "Language-Team: LANGUAGE\n" "MIME-Version: 1.0\n" @@ -69,8 +69,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/reader.py:136 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/reader.py:138 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/jacket.py:84 -#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:101 -#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:102 +#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:103 +#: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ereader/writer.py:104 #: /home/kovid/work/calibre/src/calibre/ebooks/pdb/input.py:26 #: /home/kovid/work/calibre/src/calibre/ebooks/pdb/palmdoc/writer.py:29 #: /home/kovid/work/calibre/src/calibre/ebooks/pdb/ztxt/writer.py:27 @@ -107,7 +107,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/comicconf.py:48 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/fetch_metadata.py:106 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/fetch_metadata.py:139 -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:345 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:348 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:34 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:39 #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/scheduler.py:40 @@ -126,8 +126,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/library/database2.py:1430 #: /home/kovid/work/calibre/src/calibre/library/database2.py:1514 #: /home/kovid/work/calibre/src/calibre/library/database2.py:1599 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:1622 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:1673 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:1621 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:1672 #: /home/kovid/work/calibre/src/calibre/library/server.py:294 #: /home/kovid/work/calibre/src/calibre/library/server.py:355 #: /home/kovid/work/calibre/src/calibre/utils/podofo/__init__.py:45 @@ -1384,6 +1384,11 @@ msgid "" "Fetch a cover image for the book identified by ISBN from LibraryThing.com\n" msgstr "" +#: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:1053 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260 +msgid "Cover" +msgstr "" + #: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:22 msgid "Modify images to meet Palm device size limitations." msgstr "" @@ -1405,14 +1410,10 @@ msgstr "" msgid "Disable compression of the file contents." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:101 +#: /home/kovid/work/calibre/src/calibre/ebooks/mobi/output.py:103 msgid "All articles" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260 -msgid "Cover" -msgstr "" - #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1261 msgid "Title Page" msgstr "" @@ -3662,50 +3663,50 @@ msgstr "" msgid "The cover in the %s format is invalid" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:402 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:405 msgid "Downloading cover..." msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:414 -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:419 -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:425 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:417 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:422 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:428 msgid "Cannot fetch cover" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:415 -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:426 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:418 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:429 msgid "Could not fetch cover.
" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:416 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:419 msgid "The download timed out." msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:420 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:423 msgid "Could not find cover for this book. Try specifying the ISBN first." msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:432 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:435 msgid "Bad cover" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:433 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:436 msgid "The cover is not a valid picture" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:472 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:475 msgid "Cannot fetch metadata" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:473 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:476 msgid "You must specify at least one of ISBN, Title, Authors or Publisher" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:499 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:502 msgid "Permission denied" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:500 +#: /home/kovid/work/calibre/src/calibre/gui2/dialogs/metadata_single.py:503 msgid "Could not open %s. Is it being used by another program?" msgstr "" diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py index b6dd885278..98c7da178e 100644 --- a/src/calibre/utils/logging.py +++ b/src/calibre/utils/logging.py @@ -102,3 +102,5 @@ class Log(object): def __call__(self, *args, **kwargs): self.prints(INFO, *args, **kwargs) + +default_log = Log() diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index cf0e16ecf0..acb1d967b0 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in ( 'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres', 'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate', 'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna', - 'eltiempo_hn', + 'eltiempo_hn', 'slate', )] diff --git a/src/calibre/web/feeds/recipes/recipe_al_jazeera.py b/src/calibre/web/feeds/recipes/recipe_al_jazeera.py index 9923f00392..d8aada2356 100644 --- a/src/calibre/web/feeds/recipes/recipe_al_jazeera.py +++ b/src/calibre/web/feeds/recipes/recipe_al_jazeera.py @@ -12,26 +12,27 @@ class AlJazeera(BasicNewsRecipe): title = 'Al Jazeera in English' __author__ = 'Darko Miletic' description = 'News from Middle East' + language = _('English') publisher = 'Al Jazeera' category = 'news, politics, middle east' simultaneous_downloads = 1 - delay = 4 + delay = 4 oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True encoding = 'iso-8859-1' remove_javascript = True use_embedded_content = False - + html2lrf_options = [ '--comment', description , '--category', category , '--publisher', publisher , '--ignore-tables' ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True' - + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True' + keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})] remove_tags = [ diff --git a/src/calibre/web/feeds/recipes/recipe_azstarnet.py b/src/calibre/web/feeds/recipes/recipe_azstarnet.py index 55cfcf78f1..c1652eb10e 100644 --- a/src/calibre/web/feeds/recipes/recipe_azstarnet.py +++ b/src/calibre/web/feeds/recipes/recipe_azstarnet.py @@ -12,9 +12,10 @@ class Azstarnet(BasicNewsRecipe): title = 'Arizona Daily Star' __author__ = 'Darko Miletic' description = 'news from Arizona' + language = _('English') publisher = 'azstarnet.com' category = 'news, politics, Arizona, USA' - delay = 1 + delay = 1 oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True @@ -28,8 +29,8 @@ class Azstarnet(BasicNewsRecipe): , '--category', category , '--publisher', publisher ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -40,8 +41,8 @@ class Azstarnet(BasicNewsRecipe): br['pass' ] = self.password br.submit() return br - - + + keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})] remove_tags = [ @@ -49,15 +50,15 @@ class Azstarnet(BasicNewsRecipe): ,dict(name='div',attrs={'class':'bannerinstory'}) ] - + feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')] def preprocess_html(self, soup): soup.html['dir' ] = 'ltr' soup.html['lang'] = 'en-US' mtag = '\n\n\n' - soup.head.insert(0,mtag) + soup.head.insert(0,mtag) for item in soup.findAll(style=True): - del item['style'] + del item['style'] return soup - \ No newline at end of file + diff --git a/src/calibre/web/feeds/recipes/recipe_coding_horror.py b/src/calibre/web/feeds/recipes/recipe_coding_horror.py index 2caa4f2280..edc671f6ef 100644 --- a/src/calibre/web/feeds/recipes/recipe_coding_horror.py +++ b/src/calibre/web/feeds/recipes/recipe_coding_horror.py @@ -14,6 +14,7 @@ class CodingHorror(BasicNewsRecipe): description = 'programming and human factors - Jeff Atwood' category = 'blog, programming' publisher = 'Jeff Atwood' + language = _('English') author = 'Jeff Atwood' oldest_article = 30 max_articles_per_feed = 100 diff --git a/src/calibre/web/feeds/recipes/recipe_linuxdevices.py b/src/calibre/web/feeds/recipes/recipe_linuxdevices.py index f3006e427e..5f2ef3529b 100644 --- a/src/calibre/web/feeds/recipes/recipe_linuxdevices.py +++ b/src/calibre/web/feeds/recipes/recipe_linuxdevices.py @@ -16,6 +16,7 @@ class Sueddeutsche(BasicNewsRecipe): use_embedded_content = False timefmt = ' [%a %d %b %Y]' max_articles_per_feed = 50 + language = _('English') no_stylesheets = True html2epub_options = 'linearize_tables = True\nbase_font_size2=14' html2lrf_options = ['--ignore-tables'] diff --git a/src/calibre/web/feeds/recipes/recipe_moneynews.py b/src/calibre/web/feeds/recipes/recipe_moneynews.py index 96656e490d..46f494f27e 100644 --- a/src/calibre/web/feeds/recipes/recipe_moneynews.py +++ b/src/calibre/web/feeds/recipes/recipe_moneynews.py @@ -11,25 +11,26 @@ from calibre.web.feeds.news import BasicNewsRecipe class MoneyNews(BasicNewsRecipe): title = 'Moneynews.com' __author__ = 'Darko Miletic' - description = 'Financial news worldwide' + description = 'Financial news worldwide' publisher = 'moneynews.com' - category = 'news, finances, USA, business' + language = _('English') + category = 'news, finances, USA, business' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' - + html2lrf_options = [ '--comment', description , '--category', category , '--publisher', publisher , '--ignore-tables' ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - feeds = [ + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + + feeds = [ (u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' ) ,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' ) ,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' ) @@ -38,12 +39,12 @@ class MoneyNews(BasicNewsRecipe): ,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' ) ] - + keep_only_tags = [dict(name='table', attrs={'class':'copy'})] - + remove_tags = [ dict(name='td' , attrs={'id':'article_fontsize'}) ,dict(name='table', attrs={'id':'toolbox' }) ,dict(name='tr' , attrs={'id':'noprint3' }) ] - \ No newline at end of file + diff --git a/src/calibre/web/feeds/recipes/recipe_publico.py b/src/calibre/web/feeds/recipes/recipe_publico.py index 17e168955f..34c89ccb6c 100644 --- a/src/calibre/web/feeds/recipes/recipe_publico.py +++ b/src/calibre/web/feeds/recipes/recipe_publico.py @@ -17,7 +17,7 @@ class Publico(BasicNewsRecipe): max_articles_per_feed = 30 encoding='utf-8' no_stylesheets = True - language = _('Portuguese') + language = _('Portugese') preprocess_regexps = [(re.compile(u"\uFFFD", re.DOTALL|re.IGNORECASE), lambda match: ''),] feeds = [ diff --git a/src/calibre/web/feeds/recipes/recipe_scott_hanselman.py b/src/calibre/web/feeds/recipes/recipe_scott_hanselman.py index a97946a018..968868cf8a 100644 --- a/src/calibre/web/feeds/recipes/recipe_scott_hanselman.py +++ b/src/calibre/web/feeds/recipes/recipe_scott_hanselman.py @@ -15,6 +15,7 @@ class ScottHanselman(BasicNewsRecipe): category = "Scott, Computer, Zen, .NET, C#, Hanselman, Scott, Weblog, Diabetes, Portland, Zimbabwe, ComputerZen.com - Scott Hanselman's Musings" publisher = 'Scott Hanselman' author = 'Scott Hanselman' + language = _('English') oldest_article = 30 max_articles_per_feed = 100 no_stylesheets = True diff --git a/src/calibre/web/feeds/recipes/recipe_slate.py b/src/calibre/web/feeds/recipes/recipe_slate.py new file mode 100644 index 0000000000..dae94573b0 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_slate.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal ' +''' +Fetches the last 7 days of featured articles from slate.com +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag + +class Slate(BasicNewsRecipe): + # Method variables for customizing downloads + title = 'Slate' + description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.' + __author__ = 'GRiker@hotmail.com' + language = _('English') + max_articles_per_feed = 40 + oldest_article = 7.0 + recursions = 0 + delay = 0 + simultaneous_downloads = 5 + timeout = 120.0 + timefmt = '' + feeds = None + no_stylesheets = True + encoding = None + + # Method variables for customizing feed parsing + summary_length = 250 + use_embedded_content = None + + # Method variables for pre/post processing of HTML + remove_tags = [ dict(name=['link','style']), + dict(id=['toolbox','site_navigation','article_bottom_tools_cntr', + 'article_bottom_tools','recommend_tab2','bottom_sponsored_links', + 'fray_article_discussion','bizbox_sponsored_links_bottom', + 'page_rightcol','top_banner','also_in_slate_bottom','articlefooter', + 'article_top_wedge','content-top','page-title', + 'block-today039s-business-press-archives','block-blog-roll', + 'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm', + 'service-links-bottom','comments','ft']), + dict(attrs={'class':['fray_article_links','clearing','nav', + 'service-links service-links-stack','yui-b last', + 'read-more-comments']})] + extra_css = '.headline {text-align:left;}\n\ + .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\ + .dateline {text-align:left; height:0pt;}\n\ + .source {align:left;}\n\ + .credit {text-align:right;font-size:smaller;}\n' + + baseURL = 'http://slate.com' + section_dates = [] + + def tag_to_strings(self, tag): + if not tag: + return '' + if isinstance(tag, basestring): + return tag + strings = [] + for item in tag.contents: + if isinstance(item, (NavigableString, CData)): + strings.append(item.string) + elif isinstance(item, Tag): + res = self.tag_to_string(item) + if res: + strings.append(res) + return strings + + def extract_sections(self): + soup = self.index_to_soup( self.baseURL ) + + soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'}) + soup = soup.find(True, attrs={'id':'toc_links_container'}) + + todays_section = soup.find(True, attrs={'class':'todaydateline'}) + self.section_dates.append(self.tag_to_string(todays_section,use_alt=False)) + self.section_dates.append(self.tag_to_string(todays_section,use_alt=False)) + + older_section_dates = soup.findAll(True, attrs={'class':'maindateline'}) + for older_section in older_section_dates : + self.section_dates.append(self.tag_to_string(older_section,use_alt=False)) + + headline_stories = soup_top_stories.find('ul') + section_lists = soup.findAll('ul') + # Prepend the headlines to the first section + section_lists[0].insert(0,headline_stories) + + sections = [] + for section in section_lists : + sections.append(section) + + return sections + + + def extract_section_articles(self, sections_html) : + soup = self.index_to_soup(str(sections_html)) + sections = soup.findAll('ul') + articles = {} + key = None + ans = [] + + for (i,section) in enumerate(sections) : + + # Get the section name + if section.has_key('id') : + key = self.section_dates[i] + articles[key] = [] + ans.append(key) + else : + continue + + # Get the section article_list + article_list = section.findAll('li') + + excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast'] + excludedTitleKeywords = ['Gabfest','Slate V'] + excludedAuthorKeywords = ['Prudence'] + + # Extract the article attributes + for article in article_list : + bylines = self.tag_to_strings(article) + url = article.a['href'] + title = bylines[0] + full_title = self.tag_to_string(article) + + author = None + description = None + pubdate = None + + if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 : + description = "A summary of what's in the major U.S. newspapers." + + if len(bylines) == 3 : + author = bylines[2].strip() + author = re.sub('[\r][\n][\t][\t\t]','', author) + author = re.sub(',','', author) + if bylines[1] is not None : + description = bylines[1] + full_byline = self.tag_to_string(article) + if full_byline.find('major U.S. newspapers') > 0 : + description = "A summary of what's in the major U.S. newspapers." + + + if len(bylines) > 3 and author is not None: + author += " | " + for (i,substring) in enumerate(bylines[3:]) : + #print "substring: %s" % substring.encode('cp1252') + author += substring.strip() + if i < len(bylines[3:]) : + author += " | " + + # Skip articles whose descriptions contain excluded keywords + if description is not None : + excluded = re.compile('|'.join(excludedDescriptionKeywords)) + found_excluded = excluded.search(description) + if found_excluded : + continue + + # Skip articles whose title contain excluded keywords + if full_title is not None : + excluded = re.compile('|'.join(excludedTitleKeywords)) + #self.log("evaluating full_title: %s" % full_title) + found_excluded = excluded.search(full_title) + if found_excluded : + continue + + # Skip articles whose author contain excluded keywords + if author is not None : + excluded = re.compile('|'.join(excludedAuthorKeywords)) + found_excluded = excluded.search(author) + if found_excluded : + continue + + skip_this_article = False + # Check to make sure we're not adding a duplicate + for article in articles[key] : + if article['url'] == url : + skip_this_article = True + break + + if skip_this_article : + continue + + # Build the dictionary entry for this article + feed = key + if not articles.has_key(feed) : + articles[feed] = [] + articles[feed].append(dict(title=title, url=url, date=pubdate, description=description, + author=author, content='')) + # Promote 'newspapers' to top + for (i,article) in enumerate(articles[feed]) : + if article['description'] is not None : + if article['description'].find('newspapers') > 0 : + articles[feed].insert(0,articles[feed].pop(i)) + + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + ans = self.remove_duplicates(ans) + return ans + + def flatten_document(self, ans): + flat_articles = [] + for (i,section) in enumerate(ans) : + for article in section[1] : + flat_articles.append(article) + flat_section = ['All Articles', flat_articles] + flat_ans = [flat_section] + + return flat_ans + + def remove_duplicates(self, ans): + for (i,section) in enumerate(ans) : + for article in section[1] : + for (j,subsequent_section) in enumerate(ans[i+1:]) : + for (k,subsequent_article) in enumerate(subsequent_section[1]) : + if article['url'] == subsequent_article['url'] : + del subsequent_section[1][k] + return ans + + def print_version(self, url) : + return url + 'pagenum/all/' + + # Class methods + def parse_index(self) : + sections = self.extract_sections() + section_list = self.extract_section_articles(sections) + section_list = self.flatten_document(section_list) + return section_list + + + def postprocess_html(self, soup, first_fetch) : + # Fix up dept_kicker as

+ dept_kicker = soup.find(True, attrs={'class':'department_kicker'}) + if dept_kicker is not None : + kicker_strings = self.tag_to_strings(dept_kicker) + kicker = kicker_strings[2] + kicker_strings[3] + kicker = re.sub('.','',kicker) + h3Tag = Tag(soup, "h3") + emTag = Tag(soup, "em") + h3Tag.insert(0, emTag) + emTag.insert(0,kicker) + dept_kicker.replaceWith(h3Tag) + + # Change

to

+ headline = soup.find("h1") + if headline is not None : + h2tag = Tag(soup, "h2") + h2tag['class'] = "headline" + strs = self.tag_to_strings(headline) + result = '' + for (i,substr) in enumerate(strs) : + result += substr + if i < len(strs) -1 : + result += '
' + h2tag.insert(0, result) + headline.replaceWith(h2tag) + + # Fix up the concatenated byline and dateline + byline = soup.find(True,attrs={'class':'byline'}) + if byline is not None : + bylineTag = Tag(soup,'div') + bylineTag['class'] = 'byline' + bylineTag.insert(0,self.tag_to_string(byline)) + byline.replaceWith(bylineTag) + + dateline = soup.find(True, attrs={'class':'dateline'}) + if dateline is not None : + datelineTag = Tag(soup, 'div') + datelineTag['class'] = 'dateline' + datelineTag.insert(0,self.tag_to_string(dateline)) + dateline.replaceWith(datelineTag) + + # Change captions to italic, add
+ for caption in soup.findAll(True, {'class':'caption'}) : + if caption is not None: + emTag = Tag(soup, "em") + emTag.insert(0, '
' + self.tag_to_string(caption)) + hrTag = Tag(soup, 'hr') + emTag.insert(1, hrTag) + caption.replaceWith(emTag) + + return soup + + def postprocess_book(self, oeb, opts, log) : + + def extract_byline(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + byline = soup.find(True,attrs={'class':'byline'}) + if byline is not None: + return self.tag_to_string(byline,use_alt=False) + else : + return None + + def extract_description(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + paragraphs = soup.findAll('p') + for p in paragraphs : + if self.tag_to_string(p,use_alt=False).startswith('By ') or \ + self.tag_to_string(p,use_alt=False).startswith('Posted '): + continue + + images = p.findAll(True, attrs={'class':'imagewrapper'}) + for image in images : + image.extract() + return self.tag_to_string(p,use_alt=False)[:200] + '...' + + return None + + if oeb.toc.depth() == 2 : + for article in oeb.toc : + if article.author is None : + article.author = extract_byline(article.href) + + if article.description is None : + article.description = extract_description(article.href) + + + elif oeb.toc.depth() == 3 : + for section in oeb.toc : + for article in section : + if article.author is None : + article.author = extract_byline(article.href) + + if article.description is None : + article.description = extract_description(article.href) + + + diff --git a/src/calibre/web/feeds/recipes/recipe_stackoverflow.py b/src/calibre/web/feeds/recipes/recipe_stackoverflow.py index 53f4642f9e..bec58e0e20 100644 --- a/src/calibre/web/feeds/recipes/recipe_stackoverflow.py +++ b/src/calibre/web/feeds/recipes/recipe_stackoverflow.py @@ -15,6 +15,7 @@ class StackOverflowBlog(BasicNewsRecipe): category = 'blog, programming' publisher = 'StackOverflow team' oldest_article = 30 + language = _('English') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = True diff --git a/src/calibre/web/feeds/recipes/recipe_wired.py b/src/calibre/web/feeds/recipes/recipe_wired.py index 101adca7b0..e49153cd27 100644 --- a/src/calibre/web/feeds/recipes/recipe_wired.py +++ b/src/calibre/web/feeds/recipes/recipe_wired.py @@ -2,36 +2,25 @@ __license__ = 'GPL v3' __docformat__ = 'restructuredtext en' -import re from calibre.web.feeds.news import BasicNewsRecipe class Wired(BasicNewsRecipe): - + title = 'Wired.com' - __author__ = 'David Chen DaveChenorg>' + __author__ = 'Kovid Goyal' description = 'Technology news' timefmt = ' [%Y%b%d %H%M]' language = _('English') no_stylesheets = True - #html2lrf_options = ['--base-font-size', '16'] - - preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - - [ - - ## Remove any banners/links/ads/cruft before the body of the article. - (r')|(