From d3814a3a5a4ecd5e26270f143ff40bd67338dcc1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 18:50:26 -0600 Subject: [PATCH 01/19] /browse: Make the top level page more semantic and pentadactyl firendly --- resources/content_server/browse/browse.css | 2 ++ resources/content_server/browse/browse.js | 2 +- setup/server.py | 8 +++++++- src/calibre/library/server/browse.py | 5 +++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/resources/content_server/browse/browse.css b/resources/content_server/browse/browse.css index 92ed4c3ce6..9a2125c0c0 100644 --- a/resources/content_server/browse/browse.css +++ b/resources/content_server/browse/browse.css @@ -208,6 +208,8 @@ h2.library_name { } +.toplevel li a { text-decoration: none; } + .toplevel li img { vertical-align: middle; margin-right: 1em; diff --git a/resources/content_server/browse/browse.js b/resources/content_server/browse/browse.js index 89ce679871..db4e602449 100644 --- a/resources/content_server/browse/browse.js +++ b/resources/content_server/browse/browse.js @@ -116,7 +116,7 @@ function toplevel() { $(".sort_select").hide(); $(".toplevel li").click(function() { - var href = $(this).children("span.url").text(); + var href = $(this).children("a").attr('href'); window.location = href; }); diff --git a/setup/server.py b/setup/server.py index 2103f4805a..d9c444fa55 100644 --- a/setup/server.py +++ b/setup/server.py @@ -24,6 +24,10 @@ class Server(Command): self.rebuild_monocole() p = subprocess.Popen(['calibre-server', '--develop'], stderr=subprocess.STDOUT, stdout=log) + time.sleep(0.2) + if p.poll() is not None: + print 'Starting server failed' + raise SystemExit(1) return p def run(self, opts): @@ -38,9 +42,11 @@ class Server(Command): try: raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ') except: + if p.poll() is None: + p.kill() break else: - while p.returncode is None: + while p.poll() is None: p.terminate() time.sleep(0.1) p.kill() diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 9c442acc11..935d472cb1 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -335,9 +335,10 @@ class BrowseServer(object): icon = 'blank.png' cats.append((meta['name'], category, icon)) - cats = [('
  • {0}' + cats = [('
  •  ' + '{0}' '{0}' - '{3}/browse/category/{1}
  • ') + '') .format(xml(x, True), xml(quote(y)), xml(_('Browse books by')), self.opts.url_prefix, src='/browse/icon/'+z) for x, y, z in cats] From 39e102e3f84a76f54a43f1ab2e476bd1089e5a64 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 19:25:50 -0600 Subject: [PATCH 02/19] /browse: Make category listing also more semantic and pentadactyl friendly --- resources/content_server/browse/browse.css | 11 +++++++++-- resources/content_server/browse/browse.js | 4 ++-- resources/content_server/{ => read}/monocle.js | 0 setup/server.py | 2 +- src/calibre/library/server/browse.py | 16 +++++++++------- 5 files changed, 21 insertions(+), 12 deletions(-) rename resources/content_server/{ => read}/monocle.js (100%) diff --git a/resources/content_server/browse/browse.css b/resources/content_server/browse/browse.css index 9a2125c0c0..1243795e55 100644 --- a/resources/content_server/browse/browse.css +++ b/resources/content_server/browse/browse.css @@ -263,9 +263,16 @@ h2.library_name { } -.category div.category-item span.href { display: none } +.category div.category-item a { text-decoration: none; color: inherit; } -#groups span.load_href { display: none } +#groups a.load_href { + text-decoration: none; + color: inherit; + font-size: medium; + font-weight: normal; + padding: 0; + padding-left: 0.5em; +} #groups h3 { font-weight: bold; diff --git a/resources/content_server/browse/browse.js b/resources/content_server/browse/browse.js index db4e602449..e0585a9afd 100644 --- a/resources/content_server/browse/browse.js +++ b/resources/content_server/browse/browse.js @@ -133,7 +133,7 @@ function render_error(msg) { // Category feed {{{ function category_clicked() { - var href = $(this).find("span.href").html(); + var href = $(this).find("a").attr('href'); window.location = href; } @@ -151,7 +151,7 @@ function category() { change: function(event, ui) { if (ui.newContent) { - var href = ui.newContent.children("span.load_href").html(); + var href = ui.newContent.prev().children("a.load_href").attr('href'); ui.newContent.children(".loading").show(); if (href) { $.ajax({ diff --git a/resources/content_server/monocle.js b/resources/content_server/read/monocle.js similarity index 100% rename from resources/content_server/monocle.js rename to resources/content_server/read/monocle.js diff --git a/setup/server.py b/setup/server.py index d9c444fa55..443ffb7da9 100644 --- a/setup/server.py +++ b/setup/server.py @@ -18,7 +18,7 @@ class Server(Command): def rebuild_monocole(self): subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH, '-I', 'src', 'src/monocle.js'], - stdout=open('resources/content_server/monocle.js', 'wb')) + stdout=open('resources/content_server/read/monocle.js', 'wb')) def launch_server(self, log): self.rebuild_monocole() diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 935d472cb1..7131ead77f 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -123,9 +123,10 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{ def item(i): templ = (u'
    ' - '
    {0}
    {1}
    ' - '
    {2}' - '{5}{3}
    ') + '
    ' + '{0}
    ' + '
    {1}
    ' + '
    {2}
    ') rating, rstring = render_rating(i.avg_rating, prefix) name = xml(i.name) if datatype == 'rating': @@ -142,7 +143,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{ q = category href = '/browse/matches/%s/%s'%(quote(q), quote(id_)) return templ.format(xml(name), rating, - xml(desc), xml(href), rstring, prefix) + xml(desc), xml(href, True), rstring, prefix) items = list(map(item, items)) return '\n'.join(['
    '] + items + ['
    ']) @@ -394,14 +395,15 @@ class BrowseServer(object): for x in sorted(starts): category_groups[x] = len([y for y in items if getter(y).upper().startswith(x)]) - items = [(u'

    {0} [{2}]

    ' + items = [(u'

    {0} [{2}]

    ' u'' u'
    {1}{1}
    ' - u'{4}{3}
    ').format( + u'
    ').format( xml(s, True), xml(_('Loading, please wait'))+'…', unicode(c), - xml(u'/browse/category_group/%s/%s'%(category, s)), + xml(u'/browse/category_group/%s/%s'%(category, s), True), self.opts.url_prefix) for s, c in category_groups.items()] items = '\n\n'.join(items) From 20e015ed748761ffc331e92aa9e998eff5d8622f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 22:40:13 -0600 Subject: [PATCH 03/19] Content server: Handle books with # in their title/authors correctly. Fixes #7354 (Having problems with calibre server) --- setup/server.py | 69 ++++++++++++++++++++++----- src/calibre/library/server/browse.py | 3 +- src/calibre/library/server/content.py | 4 +- src/calibre/library/server/mobile.py | 5 +- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/setup/server.py b/setup/server.py index 443ffb7da9..c48294ac70 100644 --- a/setup/server.py +++ b/setup/server.py @@ -5,10 +5,37 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import subprocess, tempfile, os, time +import subprocess, tempfile, os, time, sys from setup import Command +try: + from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent +except: + wm = None +else: + wm = WatchManager() + flags = EventsCodes.ALL_FLAGS + mask = flags['IN_MODIFY'] + + class ProcessEvents(ProcessEvent): + + def __init__(self, command): + ProcessEvent.__init__(self) + self.command = command + + def process_default(self, event): + name = getattr(event, + 'name', None) + if name and os.path.splitext(name)[1].startswith('.py'): + print + print name, 'changed' + self.command.kill_server() + self.command.launch_server() + print self.command.prompt, + sys.stdout.flush() + + class Server(Command): description = 'Run the calibre server in development mode conveniently' @@ -20,35 +47,51 @@ class Server(Command): '-I', 'src', 'src/monocle.js'], stdout=open('resources/content_server/read/monocle.js', 'wb')) - def launch_server(self, log): + def launch_server(self): + print 'Starting server...\n' self.rebuild_monocole() p = subprocess.Popen(['calibre-server', '--develop'], - stderr=subprocess.STDOUT, stdout=log) + stderr=subprocess.STDOUT, stdout=self.server_log) time.sleep(0.2) if p.poll() is not None: print 'Starting server failed' raise SystemExit(1) return p + def kill_server(self): + while self.server_proc.poll() is None: + self.server_proc.terminate() + time.sleep(0.1) + self.server_proc.kill() + + def watch(self): + if wm is not None: + self.notifier = ThreadedNotifier(wm, ProcessEvents(self)) + self.notifier.start() + self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True) + def run(self, opts): tdir = tempfile.gettempdir() logf = os.path.join(tdir, 'calibre-server.log') - log = open(logf, 'ab') + self.server_log = open(logf, 'ab') + self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: ' print 'Server log available at:', logf + print + self.server_proc = None + self.watch() while True: - print 'Starting server...' - p = self.launch_server(log) + self.server_proc = self.launch_server() try: - raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ') + raw_input(self.prompt) except: - if p.poll() is None: - p.kill() + if self.server_proc.poll() is None: + self.server_proc.kill() break else: - while p.poll() is None: - p.terminate() - time.sleep(0.1) - p.kill() + self.kill_server() print + if hasattr(self, 'notifier'): + self.notifier.stop() + diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 7131ead77f..709d872ba2 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -566,7 +566,8 @@ class BrowseServer(object): if not val: val = '' args[key] = xml(val, True) - fname = ascii_filename(args['title']) + ' - ' + ascii_filename(args['authors']) + fname = quote(ascii_filename(args['title']) + ' - ' + + ascii_filename(args['authors'])) return args, fmt, fmts, fname @Endpoint(mimetype='application/json; charset=utf-8') diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 670c31b9df..6437f02cb6 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -70,10 +70,10 @@ class ContentServer(object): id = id.rpartition('_')[-1].partition('.')[0] match = re.search(r'\d+', id) if not match: - raise cherrypy.HTTPError(400, 'id:%s not an integer'%id) + raise cherrypy.HTTPError(404, 'id:%s not an integer'%id) id = int(match.group()) if not self.db.has_id(id): - raise cherrypy.HTTPError(400, 'id:%d does not exist in database'%id) + raise cherrypy.HTTPError(404, 'id:%d does not exist in database'%id) if what == 'thumb' or what.startswith('thumb_'): try: width, height = map(int, what.split('_')[1:]) diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index a889089109..d66e6d842f 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import re, os import __builtin__ +from urllib import quote import cherrypy from lxml import html @@ -115,8 +116,8 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS, data = TD() for fmt in book['formats'].split(','): - a = ascii_filename(book['authors']) - t = ascii_filename(book['title']) + a = quote(ascii_filename(book['authors'])) + t = quote(ascii_filename(book['title'])) s = SPAN( A( fmt.lower(), From b597410d84a084da3f7ca7566ec50c62ea6cb812 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 23:25:30 -0600 Subject: [PATCH 04/19] calibre-server: Make auto reload control separate from --devlop with a new command line option --auto-reload --- setup/server.py | 35 +++++++++++++++++------------- src/calibre/library/server/base.py | 21 +++++++++--------- src/calibre/library/server/main.py | 3 +++ 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/setup/server.py b/setup/server.py index c48294ac70..98a9e8fa90 100644 --- a/setup/server.py +++ b/setup/server.py @@ -6,6 +6,7 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' import subprocess, tempfile, os, time, sys +from threading import RLock from setup import Command @@ -27,10 +28,11 @@ else: def process_default(self, event): name = getattr(event, 'name', None) - if name and os.path.splitext(name)[1].startswith('.py'): + if name and os.path.splitext(name)[1] == '.py': print print name, 'changed' self.command.kill_server() + time.sleep(0.1) self.command.launch_server() print self.command.prompt, sys.stdout.flush() @@ -49,20 +51,23 @@ class Server(Command): def launch_server(self): print 'Starting server...\n' - self.rebuild_monocole() - p = subprocess.Popen(['calibre-server', '--develop'], - stderr=subprocess.STDOUT, stdout=self.server_log) - time.sleep(0.2) - if p.poll() is not None: - print 'Starting server failed' - raise SystemExit(1) - return p + with self.lock: + self.rebuild_monocole() + p = subprocess.Popen(['calibre-server', '--develop'], + stderr=subprocess.STDOUT, stdout=self.server_log) + time.sleep(0.2) + if p.poll() is not None: + print 'Starting server failed' + raise SystemExit(1) + return p def kill_server(self): - while self.server_proc.poll() is None: - self.server_proc.terminate() - time.sleep(0.1) - self.server_proc.kill() + print 'Killing server...\n' + with self.lock: + if self.server_proc.poll() is None: + self.server_proc.terminate() + while self.server_proc.poll() is None: + time.sleep(0.1) def watch(self): if wm is not None: @@ -71,6 +76,7 @@ class Server(Command): self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True) def run(self, opts): + self.lock = RLock() tdir = tempfile.gettempdir() logf = os.path.join(tdir, 'calibre-server.log') self.server_log = open(logf, 'ab') @@ -85,8 +91,7 @@ class Server(Command): try: raw_input(self.prompt) except: - if self.server_proc.poll() is None: - self.server_proc.kill() + self.kill_server() break else: self.kill_server() diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index c9025a28f8..29636c5659 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -118,16 +118,17 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, self.set_database(db) cherrypy.config.update({ - 'log.screen' : opts.develop, - 'engine.autoreload_on' : opts.develop, - 'tools.log_headers.on' : opts.develop, - 'checker.on' : opts.develop, - 'request.show_tracebacks': show_tracebacks, - 'server.socket_host' : listen_on, - 'server.socket_port' : opts.port, - 'server.socket_timeout' : opts.timeout, #seconds - 'server.thread_pool' : opts.thread_pool, # number of threads - }) + 'log.screen' : opts.develop, + 'engine.autoreload_on' : getattr(opts, + 'auto_reload', False), + 'tools.log_headers.on' : opts.develop, + 'checker.on' : opts.develop, + 'request.show_tracebacks': show_tracebacks, + 'server.socket_host' : listen_on, + 'server.socket_port' : opts.port, + 'server.socket_timeout' : opts.timeout, #seconds + 'server.thread_pool' : opts.thread_pool, # number of threads + }) if embedded or wsgi: cherrypy.config.update({'engine.SIGHUP' : None, 'engine.SIGTERM' : None,}) diff --git a/src/calibre/library/server/main.py b/src/calibre/library/server/main.py index fbd811a1ab..b7cb3ecf12 100644 --- a/src/calibre/library/server/main.py +++ b/src/calibre/library/server/main.py @@ -58,6 +58,9 @@ The OPDS interface is advertised via BonJour automatically. help=_('Specifies a restriction to be used for this invocation. ' 'This option overrides any per-library settings specified' ' in the GUI')) + parser.add_option('--auto-reload', default=False, action='store_true', + help=_('Auto reload server when source code changes. May not' + ' work in all environments.')) return parser From 32b21d78efc8b7d582702b0376b13b707579c116 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 30 Oct 2010 23:42:35 -0600 Subject: [PATCH 05/19] ... --- setup/server.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/setup/server.py b/setup/server.py index 98a9e8fa90..276a606fc4 100644 --- a/setup/server.py +++ b/setup/server.py @@ -32,7 +32,6 @@ else: print print name, 'changed' self.command.kill_server() - time.sleep(0.1) self.command.launch_server() print self.command.prompt, sys.stdout.flush() @@ -53,7 +52,7 @@ class Server(Command): print 'Starting server...\n' with self.lock: self.rebuild_monocole() - p = subprocess.Popen(['calibre-server', '--develop'], + self.server_proc = p = subprocess.Popen(['calibre-server', '--develop'], stderr=subprocess.STDOUT, stdout=self.server_log) time.sleep(0.2) if p.poll() is not None: @@ -63,11 +62,12 @@ class Server(Command): def kill_server(self): print 'Killing server...\n' - with self.lock: - if self.server_proc.poll() is None: - self.server_proc.terminate() - while self.server_proc.poll() is None: - time.sleep(0.1) + if self.server_proc is not None: + with self.lock: + if self.server_proc.poll() is None: + self.server_proc.terminate() + while self.server_proc.poll() is None: + time.sleep(0.1) def watch(self): if wm is not None: @@ -83,14 +83,14 @@ class Server(Command): self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: ' print 'Server log available at:', logf print - self.server_proc = None self.watch() while True: - self.server_proc = self.launch_server() + self.launch_server() try: raw_input(self.prompt) except: + print self.kill_server() break else: From b4c3bcf9179c0d3f3b3085be48cd8608aea05ba6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 09:12:40 -0600 Subject: [PATCH 06/19] Fix #7356 (Error "'unicode' object has no attribute 'isoformat'" when copying book to iTunes) --- src/calibre/devices/apple/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 9ad3cf3e08..74fa868255 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.epub import set_metadata from calibre.library.server.utils import strftime from calibre.utils.config import config_dir, prefs -from calibre.utils.date import isoformat, now, parse_date +from calibre.utils.date import now, parse_date from calibre.utils.logging import Log from calibre.utils.zipfile import ZipFile @@ -2521,11 +2521,11 @@ class ITUNES(DriverBase): metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour, old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo) else: - metadata.timestamp = isoformat(now()) + metadata.timestamp = now() if DEBUG: self.log.info(" add timestamp: %s" % metadata.timestamp) else: - metadata.timestamp = isoformat(now()) + metadata.timestamp = now() if DEBUG: self.log.warning(" missing block in OPF file") self.log.info(" add timestamp: %s" % metadata.timestamp) From 9c85c1b273df3226341215a1b422e4bf1fc583d7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 09:28:37 -0600 Subject: [PATCH 07/19] Content server: Fix bug that caused errors on systems that do not use UTF-8 encoding --- src/calibre/library/server/browse.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 709d872ba2..9530a34c73 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -253,8 +253,6 @@ class BrowseServer(object): lp = self.db.library_path if isbytestring(lp): lp = force_unicode(lp, filesystem_encoding) - if isinstance(ans, unicode): - ans = ans.encode('utf-8') ans = ans.replace('{library_name}', xml(os.path.basename(lp))) ans = ans.replace('{library_path}', xml(lp, True)) ans = ans.replace('{initial_search}', initial_search) From a149cba9ebf201d010a74259d9bfc78b1953b2fb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 11:16:22 -0600 Subject: [PATCH 08/19] Fix #7357 (Support for Digma Q600) --- setup/server.py | 5 +++++ src/calibre/customize/builtins.py | 3 ++- src/calibre/devices/misc.py | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/setup/server.py b/setup/server.py index 276a606fc4..66cb6adf7b 100644 --- a/setup/server.py +++ b/setup/server.py @@ -85,8 +85,13 @@ class Server(Command): print self.watch() + first = True while True: self.launch_server() + if first: + pass + first = False + try: raw_input(self.prompt) except: diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2945cc6604..3cc84f248d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -475,7 +475,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \ SOVOS, PICO from calibre.devices.sne.driver import SNE from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ - GEMEI, VELOCITYMICRO, PDNOVEL_KOBO + GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO @@ -586,6 +586,7 @@ plugins += [ AVANT, MENTOR, SWEEX, + Q600, KOGAN, PDNOVEL, SPECTRA, diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py index 92e26d47e4..af5a77ce03 100644 --- a/src/calibre/devices/misc.py +++ b/src/calibre/devices/misc.py @@ -72,6 +72,15 @@ class SWEEX(USBMS): EBOOK_DIR_MAIN = '' SUPPORTS_SUB_DIRS = True +class Q600(SWEEX): + + name = 'Digma Q600 Device interface' + gui_name = 'Q600' + description = _('Communicate with the Digma Q600') + + BCD = [0x325] + FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt'] + class KOGAN(SWEEX): name = 'Kogan Device Interface' From e268beaa9081a2b9afc13dbe32471e43817ad88d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 11:48:22 -0600 Subject: [PATCH 09/19] Fix #7362 (7.26 freezing) --- src/calibre/ebooks/metadata/amazon.py | 14 +- src/calibre/library/comments.py | 12 + src/calibre/utils/html2text.py | 451 ++++++++++++++++++++++++++ 3 files changed, 471 insertions(+), 6 deletions(-) create mode 100644 src/calibre/utils/html2text.py diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index a8ff0f1ad0..e61e0b2748 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -14,6 +14,7 @@ from calibre import browser from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode +from calibre.library.comments import sanitize_comments_html def find_asin(br, isbn): q = 'http://www.amazon.com/s?field-keywords='+isbn @@ -95,25 +96,26 @@ def get_metadata(br, asin, mi): # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Collapse whitespace - desc = re.sub('\n+', '\n', desc) - desc = re.sub(' +', ' ', desc) + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) # Remove the notice about text referring to out of print editions desc = re.sub(r'(?s)--This text ref.*?', '', desc) # Remove comments desc = re.sub(r'(?s)', '', desc) - mi.comments = desc + mi.comments = sanitize_comments_html(desc) return True def main(args=sys.argv): # Test xisbn - print get_social_metadata('Learning Python', None, None, '8324616489') - print + #print get_social_metadata('Learning Python', None, None, '8324616489') + #print # Test sophisticated comment formatting - print get_social_metadata('Swan Thieves', None, None, '9780316065795') + print get_social_metadata('Swan Thieves', None, None, '9781416580829') print + return # Random tests print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 670d9f2564..45d6ccaa45 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -11,11 +11,15 @@ from calibre.constants import preferred_encoding from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ CData, Comment, Declaration, ProcessingInstruction from calibre import prepare_string_for_xml +from calibre.utils.html2text import html2text +from calibre.ebooks.markdown import markdown # Hackish - ignoring sentences ending or beginning in numbers to avoid # confusion with decimal points. lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') +sanitize_pat = re.compile(r'', 'larr':'<-', 'middot':'*', +'ndash':'-', 'oelig':'oe', 'aelig':'ae', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} + +unifiable_n = {} + +for k in unifiable.keys(): + unifiable_n[name2cp(k)] = unifiable[k] + +def charref(name): + if name[0] in ['x','X']: + c = int(name[1:], 16) + else: + c = int(name) + + if not UNICODE_SNOB and c in unifiable_n.keys(): + return unifiable_n[c] + else: + return unichr(c) + +def entityref(c): + if not UNICODE_SNOB and c in unifiable.keys(): + return unifiable[c] + else: + try: name2cp(c) + except KeyError: return "&" + c + else: return unichr(name2cp(c)) + +def replaceEntities(s): + s = s.group(1) + if s[0] == "#": + return charref(s[1:]) + else: return entityref(s) + +r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") +def unescape(s): + return r_unescape.sub(replaceEntities, s) + +def fixattrs(attrs): + # Fix bug in sgmllib.py + if not attrs: return attrs + newattrs = [] + for attr in attrs: + newattrs.append((attr[0], unescape(attr[1]))) + return newattrs + +### End Entity Nonsense ### + +def onlywhite(line): + """Return true if the line does only consist of whitespace characters.""" + for c in line: + if c is not ' ' and c is not ' ': + return c is ' ' + return line + +def optwrap(text): + """Wrap all paragraphs in the provided text.""" + if not BODY_WIDTH: + return text + + assert wrap, "Requires Python 2.3." + result = '' + newlines = 0 + for para in text.split("\n"): + if len(para) > 0: + if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': + for line in wrap(para, BODY_WIDTH): + result += line + "\n" + result += "\n" + newlines = 2 + else: + if not onlywhite(para): + result += para + "\n" + newlines = 1 + else: + if newlines < 2: + result += "\n" + newlines += 1 + return result + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): return n + except ValueError: return 0 + +class _html2text(sgmllib.SGMLParser): + def __init__(self, out=None, baseurl=''): + sgmllib.SGMLParser.__init__(self) + + if out is None: self.out = self.outtextf + else: self.out = out + self.outtext = u'' + self.quiet = 0 + self.p_p = 0 + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.lastWasNL = 0 + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + def outtextf(self, s): + self.outtext += s + + def close(self): + sgmllib.SGMLParser.close(self) + + self.pbr() + self.o('', 0, 'end') + + return self.outtext + + def handle_charref(self, c): + self.o(charref(c)) + + def handle_entityref(self, c): + self.o(entityref(c)) + + def unknown_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def unknown_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ returns the index of certain set of attributes (of a link) in the + self.a list + + If the set of attributes is not found, returns None + """ + if not attrs.has_key('href'): return None + + i = -1 + for a in self.a: + i += 1 + match = 0 + + if a.has_key('href') and a['href'] == attrs['href']: + if a.has_key('title') or attrs.has_key('title'): + if (a.has_key('title') and attrs.has_key('title') and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: return i + + def handle_tag(self, tag, attrs, start): + attrs = fixattrs(attrs) + + if hn(tag): + self.p() + if start: self.o(hn(tag)*"#" + ' ') + + if tag in ['p', 'div']: self.p() + + if tag == "br" and start: self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: self.quiet += 1 + else: self.quiet -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p(); self.o('> ', 0, 1); self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u']: self.o("_") + if tag in ['strong', 'b']: self.o("**") + if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` + if tag == "abbr": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + + self.abbr_title = None + self.abbr_data = '' + if attrs.has_key('title'): + self.abbr_title = attrs['title'] + else: + if self.abbr_title != None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = '' + + if tag == "a": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + self.astack.append(attrs) + self.o("[") + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if a: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + `a['count']` + "]") + + if tag == "img" and start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('src'): + attrs['href'] = attrs['src'] + alt = attrs.get('alt', '') + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("![") + self.o(alt) + self.o("]["+`attrs['count']`+"]") + + if tag == 'dl' and start: self.p() + if tag == 'dt' and not start: self.pbr() + if tag == 'dd' and start: self.o(' ') + if tag == 'dd' and not start: self.pbr() + + if tag in ["ol", "ul"]: + if start: + self.list.append({'name':tag, 'num':0}) + else: + if self.list: self.list.pop() + + self.p() + + if tag == 'li': + if start: + self.pbr() + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + self.o(" "*len(self.list)) #TODO: line up
    1. s > 9 correctly. + if li['name'] == "ul": self.o("* ") + elif li['name'] == "ol": + li['num'] += 1 + self.o(`li['num']`+". ") + self.start = 1 + else: + self.pbr() + + if tag in ["table", "tr"] and start: self.p() + if tag == 'td': self.pbr() + + if tag == "pre": + if start: + self.startpre = 1 + self.pre = 1 + else: + self.pre = 0 + self.p() + + def pbr(self): + if self.p_p == 0: self.p_p = 1 + + def p(self): self.p_p = 2 + + def o(self, data, puredata=0, force=0): + if self.abbr_data is not None: self.abbr_data += data + + if not self.quiet: + if puredata and not self.pre: + data = re.sub('\s+', ' ', data) + if data and data[0] == ' ': + self.space = 1 + data = data[1:] + if not data and not force: return + + if self.startpre: + #self.out(" :") #TODO: not output when already one there + self.startpre = 0 + + bq = (">" * self.blockquote) + if not (force and data and data[0] == ">") and self.blockquote: bq += " " + + if self.pre: + bq += " " + data = data.replace("\n", "\n"+bq) + + if self.start: + self.space = 0 + self.p_p = 0 + self.start = 0 + + if force == 'end': + # It's the end. + self.p_p = 0 + self.out("\n") + self.space = 0 + + + if self.p_p: + self.out(('\n'+bq)*self.p_p) + self.space = 0 + + if self.space: + if not self.lastWasNL: self.out(' ') + self.space = 0 + + if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): + if force == "end": self.out("\n") + + newa = [] + for link in self.a: + if self.outcount > link['outcount']: + self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) + if link.has_key('title'): self.out(" ("+link['title']+")") + self.out("\n") + else: + newa.append(link) + + if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. + + self.a = newa + + if self.abbr_list and force == "end": + for abbr, definition in self.abbr_list.items(): + self.out(" *[" + abbr + "]: " + definition + "\n") + + self.p_p = 0 + self.out(data) + self.lastWasNL = data and data[-1] == '\n' + self.outcount += 1 + + def handle_data(self, data): + if r'\/script>' in data: self.quiet -= 1 + self.o(data, 1) + + def unknown_decl(self, data): pass + +def wrapwrite(text): sys.stdout.write(text.encode('utf8')) + +def html2text_file(html, out=wrapwrite, baseurl=''): + h = _html2text(out, baseurl) + h.feed(html) + h.feed("") + return h.close() + +def html2text(html, baseurl=''): + return optwrap(html2text_file(html, None, baseurl)) + +if __name__ == "__main__": + baseurl = '' + if sys.argv[1:]: + arg = sys.argv[1] + if arg.startswith('http://') or arg.startswith('https://'): + baseurl = arg + j = urllib.urlopen(baseurl) + try: + from feedparser import _getCharacterEncoding as enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + text = j.read() + encoding = enc(j.headers, text)[0] + if encoding == 'us-ascii': encoding = 'utf-8' + data = text.decode(encoding) + + else: + encoding = 'utf8' + if len(sys.argv) > 2: + encoding = sys.argv[2] + data = open(arg, 'r').read().decode(encoding) + else: + data = sys.stdin.read().decode('utf8') + wrapwrite(html2text(data, baseurl)) + From 92fe7d3725f7785278c4bd2dfd5ad81e290827f5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 12:01:16 -0600 Subject: [PATCH 10/19] Amazon metadata download plugin: Improved parsing of broken HTML --- src/calibre/ebooks/metadata/amazon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index e61e0b2748..5d7d0358f0 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -9,6 +9,7 @@ Fetch metadata using Amazon AWS import sys, re from lxml import html +from lxml.html import soupparser from calibre import browser from calibre.ebooks.metadata import check_isbn @@ -71,7 +72,7 @@ def get_metadata(br, asin, mi): return False raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] - root = html.fromstring(raw) + root = soupparser.fromstring(raw) ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') if ratings: pat = re.compile(r'([0-9.]+) out of (\d+) stars') From 134fad20e0fb3d4defeb791a096499df590d72dd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 12:45:39 -0600 Subject: [PATCH 11/19] Re-arrange send to device menu to make it harder to accidentally trigger the send and delete actions --- src/calibre/gui2/device.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 78585d13b6..4e93335af6 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -484,17 +484,22 @@ class DeviceMenu(QMenu): # {{{ _('Storage Card B')), ] + later_menus = [] for menu in (self, self.set_default_menu): for actions, desc in ( (basic_actions, ''), + (specific_actions, _('Send specific format to')), (delete_actions, _('Send and delete from library')), - (specific_actions, _('Send specific format to')) ): mdest = menu if actions is not basic_actions: - mdest = menu.addMenu(desc) + mdest = QMenu(desc) self._memory.append(mdest) + later_menus.append(mdest) + if menu is self.set_default_menu: + menu.addMenu(mdest) + menu.addSeparator() for dest, delete, specific, icon, text in actions: action = DeviceAction(dest, delete, specific, icon, text, self) @@ -507,7 +512,7 @@ class DeviceMenu(QMenu): # {{{ action.a_s.connect(self.action_triggered) self.actions.append(action) mdest.addAction(action) - if actions is not specific_actions: + if actions is basic_actions: menu.addSeparator() da = config['default_send_to_device_action'] @@ -525,14 +530,21 @@ class DeviceMenu(QMenu): # {{{ self.group.triggered.connect(self.change_default_action) self.addSeparator() + self.addMenu(later_menus[0]) + self.addSeparator() + mitem = self.addAction(QIcon(I('eject.png')), _('Eject device')) mitem.setEnabled(False) mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit()) self.disconnect_mounted_device_action = mitem - self.addSeparator() + self.addMenu(self.set_default_menu) self.addSeparator() + + self.addMenu(later_menus[1]) + self.addSeparator() + annot = self.addAction(_('Fetch annotations (experimental)')) annot.setEnabled(False) annot.triggered.connect(lambda x : From 965826bef98dd750ac7056456478862bc2ff6a3d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 12:47:30 -0600 Subject: [PATCH 12/19] Correio da Manha by jmst --- resources/recipes/cm_journal.recipe | 44 +++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 resources/recipes/cm_journal.recipe diff --git a/resources/recipes/cm_journal.recipe b/resources/recipes/cm_journal.recipe new file mode 100644 index 0000000000..c47fb35775 --- /dev/null +++ b/resources/recipes/cm_journal.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class CMJornal_pt(BasicNewsRecipe): + title = 'Correio da Manha - Portugal' + __author__ = 'jmst' + description = 'As noticias de Portugal e do Mundo' + publisher = 'Cofina Media' + category = '' + oldest_article = 1 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'pt' + extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [ + dict(name=['h2','h1']) + , dict(name='div', attrs={'class': ['news']}) + ] + + remove_tags = [ + dict(name=['object','embed','iframe']) + ,dict(name='a',attrs={'href':['#']}) + ] + + feeds = [ + (u'Actualidade' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000009-0000-0000-0000-000000000009' ) + ,(u'Portugal' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000010-0000-0000-0000-000000000010' ) + ,(u'Economia' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000011-0000-0000-0000-000000000011' ) + ,(u'Mundo' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000091-0000-0000-0000-000000000091' ) + ,(u'Desporto' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000012-0000-0000-0000-000000000012' ) + ,(u'TV & Media', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000092-0000-0000-0000-000000000092') + ] + + def print_version(self, url): + return url.replace('noticia.aspx', 'Imprimir.aspx') + From b8240c99b9df56a5000201d5d880c8ff3cc8046d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 13:00:46 -0600 Subject: [PATCH 13/19] Clic_RBS by arvoredo --- resources/recipes/clic_rbs.recipe | 50 +++++++++++++++++++++++++++ src/calibre/ebooks/metadata/amazon.py | 2 +- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 resources/recipes/clic_rbs.recipe diff --git a/resources/recipes/clic_rbs.recipe b/resources/recipes/clic_rbs.recipe new file mode 100644 index 0000000000..559dfa2000 --- /dev/null +++ b/resources/recipes/clic_rbs.recipe @@ -0,0 +1,50 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ClicRBS(BasicNewsRecipe): + title = u'ClicRBS' + language = 'pt' + __author__ = 'arvoredo' + oldest_article = 3 + max_articles_per_feed = 9 + cover_url = 'http://www.publicidade.clicrbs.com.br/clicrbs/imgs/logo_clic.gif' + + remove_tags = [ + dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']}) + ] + + remove_tags_before = dict(name='div ', attrs={'class':'descricao'}) + remove_tags_before = dict(name='div', attrs={'id':'glb-corpo'}) + remove_tags_before = dict(name='div', attrs={'class':'descricao'}) + remove_tags_before = dict(name='div', attrs={'class':'coluna'}) + remove_tags_after = dict(name='div', attrs={'class':'extra'}) + remove_tags_after = dict(name='div', attrs={'id':'links-patrocinados'}) + remove_tags_after = dict(name='h4', attrs={'class':'tipo-c comente'}) + remove_tags_after = dict(name='ul', attrs={'class':'lista'}) + + feeds = [ + (u'zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=1&local=1&channel=13') + , (u'diariocatarinense.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=2&local=18&channel=67') + , (u'Concursos e Emprego', u'http://g1.globo.com/Rss2/0,,AS0-9654,00.xml') + , (u'Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?channel=87&uf=1&local=1') + , (u'Economia, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=801&uf=1&local=1&channel=13') + , (u'Esportes, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=802&uf=1&local=1&channel=13') + , (u'Economia, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1180&channel=87&uf=1&local=1') + , (u'Política, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1185&channel=87&uf=1&local=1') + , (u'Mundo, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1184&channel=87&uf=1&local=1') + , (u'Catarinense, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=&theme=371&uf=2&channel=2') + , (u'Geral, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1183&channel=87&uf=1&local=1') + , (u'Estilo de Vida, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=805&uf=1&local=1&channel=13') + , (u'Corrida, Corrida, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1313&theme=15704&uf=1&channel=2') + , (u'Jornal de Santa Catarina, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?espid=159&uf=2&local=18') + , (u'Grêmio, Futebol, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=11&theme=65&uf=1&channel=2') + , (u'Velocidade, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1314&theme=2655&uf=1&channel=2') + ] + + extra_css = ''' + cite{color:#007BB5; font-size:xx-small; font-style:italic;} + body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + h3{font-size:large; color:#082963; font-weight:bold;} + #ident{color:#0179B4; font-size:xx-small;} + p{color:#000000;font-weight:normal;} + .commentario p{color:#007BB5; font-style:italic;} + ''' diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 5d7d0358f0..9c89016e8b 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -114,7 +114,7 @@ def main(args=sys.argv): #print # Test sophisticated comment formatting - print get_social_metadata('Swan Thieves', None, None, '9781416580829') + print get_social_metadata('Angels & Demons', None, None, '9781416580829') print return From 803e9eb32069367839fc95e170b3024ac54b649b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 13:20:26 -0600 Subject: [PATCH 14/19] Revert bundled version of BeautifulSoup in windows build to 3.0.8 to improve parsing of broken HTML --- setup/installer/windows/notes.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst index a8ba41e8ff..545070f7ff 100644 --- a/setup/installer/windows/notes.rst +++ b/setup/installer/windows/notes.rst @@ -28,7 +28,9 @@ If there are no windows binaries already compiled for the version of python you Run the following command to install python dependencies:: - easy_install --always-unzip -U ipython mechanize BeautifulSoup pyreadline python-dateutil dnspython + easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython + +Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly) Qt -------- From a42f927f791ce682a72a98904bd4569b9ed1e9d5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 18:29:35 -0600 Subject: [PATCH 15/19] ... --- setup/server.py | 29 +++++++++++++++++++++++++---- src/calibre/library/comments.py | 8 +++++++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/setup/server.py b/setup/server.py index 66cb6adf7b..0fea4ec733 100644 --- a/setup/server.py +++ b/setup/server.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import subprocess, tempfile, os, time, sys +import subprocess, tempfile, os, time, sys, telnetlib from threading import RLock from setup import Command @@ -28,7 +28,12 @@ else: def process_default(self, event): name = getattr(event, 'name', None) - if name and os.path.splitext(name)[1] == '.py': + if not name: + return + ext = os.path.splitext(name)[1] + reload = False + if ext == '.py': + reload = True print print name, 'changed' self.command.kill_server() @@ -36,6 +41,9 @@ else: print self.command.prompt, sys.stdout.flush() + if reload: + self.command.reload_browser(delay=1) + class Server(Command): @@ -75,6 +83,19 @@ class Server(Command): self.notifier.start() self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True) + def reload_browser(self, delay=0.1): + time.sleep(delay) + try: + t = telnetlib.Telnet('localhost', 4242) + t.read_until("repl>") + t.write('BrowserReload();') + print t.read_until("repl>") + t.close() + except: + print 'Failed to reload browser' + import traceback + traceback.print_exc() + def run(self, opts): self.lock = RLock() tdir = tempfile.gettempdir() @@ -88,8 +109,8 @@ class Server(Command): first = True while True: self.launch_server() - if first: - pass + if not first: + self.reload_browser() first = False try: diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 45d6ccaa45..83eec89abe 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -58,7 +58,13 @@ def comments_to_html(comments): return '\n'.join(parts) if sanitize_pat.search(comments) is not None: - return sanitize_comments_html(comments) + try: + return sanitize_comments_html(comments) + except: + import traceback + traceback.print_exc() + return u'

      ' + # Explode lost CRs to \n\n comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', From bdf2cd48ddff2edb5b23bfbc971716ded8130994 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 19:24:01 -0600 Subject: [PATCH 16/19] ... --- resources/recipes/ming_pao.recipe | 51 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe index 6a61405698..162a3c774e 100644 --- a/resources/recipes/ming_pao.recipe +++ b/resources/recipes/ming_pao.recipe @@ -1,7 +1,9 @@ -cense__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Eddie Lau' ''' modified from Singtao Toronto calibre recipe by rty +Change Log: +2010/10/31: skip repeated articles in section pages ''' import datetime @@ -23,42 +25,37 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): recursions = 0 conversion_options = {'linearize_tables':True} masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), dict(attrs={'id':['newscontent01','newscontent02']})] def get_fetchdate(self): dt_utc = datetime.datetime.utcnow() - # convert UTC to local hk time - dt_local = dt_utc - datetime.timedelta(-8.0/24) + # convert UTC to local hk time - at around HKT 5.30am, all news are available + dt_local = dt_utc - datetime.timedelta(-2.5/24) return dt_local.strftime("%Y%m%d") def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) return feeds def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet']}) - current_articles = [] - for i in divs: - a = i.find('a', href = True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' +url + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet']}) + current_articles = [] + included_urls = [] + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls: current_articles.append({'title': title, 'url': url, 'description':''}) - return current_articles - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(width=True): - del item['width'] - return soup + included_urls.append(url) + return current_articles From 21731b3c046da70cdc63fa348f164b9d5f4218cc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 21:00:38 -0600 Subject: [PATCH 17/19] ... --- src/calibre/utils/html2text.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py index afe5a0aded..0eb84a3d38 100644 --- a/src/calibre/utils/html2text.py +++ b/src/calibre/utils/html2text.py @@ -9,7 +9,7 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] # Support decoded entities with unifiable. if not hasattr(__builtins__, 'True'): True, False = 1, 0 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types +import re, sys, urllib, htmlentitydefs, codecs import sgmllib import urlparse sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') @@ -18,17 +18,17 @@ try: from textwrap import wrap except: pass # Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 +UNICODE_SNOB = 1 # Put the links after each paragraph instead of at the end. LINKS_EACH_PARAGRAPH = 0 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 +BODY_WIDTH = 0 # Don't show internal links (href="#local-anchor") -- corresponding link targets # won't be visible in the plain text file anyway. -SKIP_INTERNAL_LINKS = False +SKIP_INTERNAL_LINKS = True ### Entity Nonsense ### @@ -433,8 +433,9 @@ if __name__ == "__main__": j = urllib.urlopen(baseurl) try: from feedparser import _getCharacterEncoding as enc + enc except ImportError: - enc = lambda x, y: ('utf-8', 1) + enc = lambda x, y: ('utf-8', 1) text = j.read() encoding = enc(j.headers, text)[0] if encoding == 'us-ascii': encoding = 'utf-8' From 073bf833712d7827ebe2ecfcb0b36478ea75d878 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 22:22:49 -0600 Subject: [PATCH 18/19] El Faro de Vigo by Jefferson Frantz. Fixes #405 (New news feed) --- resources/recipes/el_faro.recipe | 77 ++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 resources/recipes/el_faro.recipe diff --git a/resources/recipes/el_faro.recipe b/resources/recipes/el_faro.recipe new file mode 100644 index 0000000000..ec1b74b5cb --- /dev/null +++ b/resources/recipes/el_faro.recipe @@ -0,0 +1,77 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ElFaroDeVigo(BasicNewsRecipe): + title = u'El Faro de Vigo' + oldest_article = 1 + max_articles_per_feed = 100 + __author__ = 'Jefferson Frantz' + description = 'Noticias de Vigo' + timefmt = ' [%d %b, %Y]' + language = 'es' + encoding = 'cp1252' + no_stylesheets = True + remove_javascript = True + + feeds = [ +## (u'Vigo', u'http://www.farodevigo.es/elementosInt/rss/1'), +## (u'Gran Vigo', u'http://www.farodevigo.es/elementosInt/rss/2'), + (u'Galicia', u'http://www.farodevigo.es/elementosInt/rss/4'), + (u'España', u'http://www.farodevigo.es/elementosInt/rss/6'), + (u'Mundo', u'http://www.farodevigo.es/elementosInt/rss/7'), +## (u'Opinión', u'http://www.farodevigo.es/elementosInt/rss/5'), + (u'Economía', u'http://www.farodevigo.es/elementosInt/rss/10'), + (u'Sociedad y Cultura', u'http://www.farodevigo.es/elementosInt/rss/8'), + (u'Sucesos', u'http://www.farodevigo.es/elementosInt/rss/9'), + (u'Deportes', u'http://www.farodevigo.es/elementosInt/rss/11'), + (u'Agenda', u'http://www.farodevigo.es/elementosInt/rss/21'), + (u'Gente', u'http://www.farodevigo.es/elementosInt/rss/24'), + (u'Televisión', u'http://www.farodevigo.es/elementosInt/rss/25'), + (u'Ciencia y Tecnología', u'http://www.farodevigo.es/elementosInt/rss/26')] + + extra_css = '''.noticia_texto{ font-family: sans-serif; font-size: medium; text-align: justify } + h1{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center} + h2{font-family: serif; font-size: medium; font-weight: bold; color: #000000; text-align: left} + .enlacenegrita10{font-family: serif; font-size: small; font-weight: bold; color: #000000; text-align: left} + .noticia_titular{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}''' + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + + url = 'http://estaticos00.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + url = 'http://estaticos01.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + url = 'http://estaticos02.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + + return self.adeify_images(soup) + + def postprocess_html(self, soup, first_fetch): + divs = soup.findAll(True, {'class':'enlacenegrita10'}) + for div in divs: + div['align'] = 'left' + + return soup + + + keep_only_tags = [dict(name='div', attrs={'class':['noticias']})] + + remove_tags = [ + dict(name=['object','link','script','ul','iframe','ol']) + ,dict(name='div', attrs={'class':['noticiadd2', 'cintillo2', 'noticiadd', 'noticiadd2']}) + ,dict(name='div', attrs={'class':['imagen_derecha', 'noticiadd3', 'extraHTML']}) + + ] + + From 0c8684fa2191d1329860de55c364718c991db469 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Oct 2010 22:43:24 -0600 Subject: [PATCH 19/19] Fix #7369 (0.7.26) --- src/calibre/ebooks/metadata/amazon.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 9c89016e8b..81d996c6a7 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -72,7 +72,10 @@ def get_metadata(br, asin, mi): return False raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] - root = soupparser.fromstring(raw) + try: + root = soupparser.fromstring(raw) + except: + return False ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') if ratings: pat = re.compile(r'([0-9.]+) out of (\d+) stars')