diff --git a/resources/content_server/browse/browse.css b/resources/content_server/browse/browse.css index 92ed4c3ce6..1243795e55 100644 --- a/resources/content_server/browse/browse.css +++ b/resources/content_server/browse/browse.css @@ -208,6 +208,8 @@ h2.library_name { } +.toplevel li a { text-decoration: none; } + .toplevel li img { vertical-align: middle; margin-right: 1em; @@ -261,9 +263,16 @@ h2.library_name { } -.category div.category-item span.href { display: none } +.category div.category-item a { text-decoration: none; color: inherit; } -#groups span.load_href { display: none } +#groups a.load_href { + text-decoration: none; + color: inherit; + font-size: medium; + font-weight: normal; + padding: 0; + padding-left: 0.5em; +} #groups h3 { font-weight: bold; diff --git a/resources/content_server/browse/browse.js b/resources/content_server/browse/browse.js index 89ce679871..e0585a9afd 100644 --- a/resources/content_server/browse/browse.js +++ b/resources/content_server/browse/browse.js @@ -116,7 +116,7 @@ function toplevel() { $(".sort_select").hide(); $(".toplevel li").click(function() { - var href = $(this).children("span.url").text(); + var href = $(this).children("a").attr('href'); window.location = href; }); @@ -133,7 +133,7 @@ function render_error(msg) { // Category feed {{{ function category_clicked() { - var href = $(this).find("span.href").html(); + var href = $(this).find("a").attr('href'); window.location = href; } @@ -151,7 +151,7 @@ function category() { change: function(event, ui) { if (ui.newContent) { - var href = ui.newContent.children("span.load_href").html(); + var href = ui.newContent.prev().children("a.load_href").attr('href'); ui.newContent.children(".loading").show(); if (href) { $.ajax({ diff --git a/resources/content_server/monocle.js b/resources/content_server/read/monocle.js similarity index 100% rename from resources/content_server/monocle.js rename to resources/content_server/read/monocle.js diff --git a/resources/recipes/clic_rbs.recipe b/resources/recipes/clic_rbs.recipe new file mode 100644 index 0000000000..559dfa2000 --- /dev/null +++ b/resources/recipes/clic_rbs.recipe @@ -0,0 +1,50 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ClicRBS(BasicNewsRecipe): + title = u'ClicRBS' + language = 'pt' + __author__ = 'arvoredo' + oldest_article = 3 + max_articles_per_feed = 9 + cover_url = 'http://www.publicidade.clicrbs.com.br/clicrbs/imgs/logo_clic.gif' + + remove_tags = [ + dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']}) + ] + + remove_tags_before = dict(name='div ', attrs={'class':'descricao'}) + remove_tags_before = dict(name='div', attrs={'id':'glb-corpo'}) + remove_tags_before = dict(name='div', attrs={'class':'descricao'}) + remove_tags_before = dict(name='div', attrs={'class':'coluna'}) + remove_tags_after = dict(name='div', attrs={'class':'extra'}) + remove_tags_after = dict(name='div', attrs={'id':'links-patrocinados'}) + remove_tags_after = dict(name='h4', attrs={'class':'tipo-c comente'}) + remove_tags_after = dict(name='ul', attrs={'class':'lista'}) + + feeds = [ + (u'zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=1&local=1&channel=13') + , (u'diariocatarinense.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=2&local=18&channel=67') + , (u'Concursos e Emprego', u'http://g1.globo.com/Rss2/0,,AS0-9654,00.xml') + , (u'Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?channel=87&uf=1&local=1') + , (u'Economia, zerohora.com, clicRBS', 
u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=801&uf=1&local=1&channel=13') + , (u'Esportes, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=802&uf=1&local=1&channel=13') + , (u'Economia, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1180&channel=87&uf=1&local=1') + , (u'Política, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1185&channel=87&uf=1&local=1') + , (u'Mundo, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1184&channel=87&uf=1&local=1') + , (u'Catarinense, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=&theme=371&uf=2&channel=2') + , (u'Geral, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1183&channel=87&uf=1&local=1') + , (u'Estilo de Vida, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=805&uf=1&local=1&channel=13') + , (u'Corrida, Corrida, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1313&theme=15704&uf=1&channel=2') + , (u'Jornal de Santa Catarina, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?espid=159&uf=2&local=18') + , (u'Grêmio, Futebol, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=11&theme=65&uf=1&channel=2') + , (u'Velocidade, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1314&theme=2655&uf=1&channel=2') + ] + + extra_css = ''' + cite{color:#007BB5; font-size:xx-small; font-style:italic;} + body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + h3{font-size:large; color:#082963; font-weight:bold;} + #ident{color:#0179B4; font-size:xx-small;} + p{color:#000000;font-weight:normal;} + .commentario p{color:#007BB5; font-style:italic;} + ''' diff --git a/resources/recipes/cm_journal.recipe b/resources/recipes/cm_journal.recipe new file mode 100644 index 0000000000..c47fb35775 --- /dev/null +++ b/resources/recipes/cm_journal.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class CMJornal_pt(BasicNewsRecipe): + title = 'Correio da Manha - Portugal' + __author__ = 'jmst' + description = 'As noticias de Portugal e do Mundo' + publisher = 'Cofina Media' + category = '' + oldest_article = 1 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'pt' + extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [ + dict(name=['h2','h1']) + , dict(name='div', attrs={'class': ['news']}) + ] + + remove_tags = [ + dict(name=['object','embed','iframe']) + ,dict(name='a',attrs={'href':['#']}) + ] + + feeds = [ + (u'Actualidade' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000009-0000-0000-0000-000000000009' ) + ,(u'Portugal' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000010-0000-0000-0000-000000000010' ) + ,(u'Economia' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000011-0000-0000-0000-000000000011' ) + ,(u'Mundo' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000091-0000-0000-0000-000000000091' ) + ,(u'Desporto' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000012-0000-0000-0000-000000000012' ) + ,(u'TV & Media', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000092-0000-0000-0000-000000000092') + ] + + 
def print_version(self, url): + return url.replace('noticia.aspx', 'Imprimir.aspx') + diff --git a/resources/recipes/el_faro.recipe b/resources/recipes/el_faro.recipe new file mode 100644 index 0000000000..ec1b74b5cb --- /dev/null +++ b/resources/recipes/el_faro.recipe @@ -0,0 +1,77 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ElFaroDeVigo(BasicNewsRecipe): + title = u'El Faro de Vigo' + oldest_article = 1 + max_articles_per_feed = 100 + __author__ = 'Jefferson Frantz' + description = 'Noticias de Vigo' + timefmt = ' [%d %b, %Y]' + language = 'es' + encoding = 'cp1252' + no_stylesheets = True + remove_javascript = True + + feeds = [ +## (u'Vigo', u'http://www.farodevigo.es/elementosInt/rss/1'), +## (u'Gran Vigo', u'http://www.farodevigo.es/elementosInt/rss/2'), + (u'Galicia', u'http://www.farodevigo.es/elementosInt/rss/4'), + (u'España', u'http://www.farodevigo.es/elementosInt/rss/6'), + (u'Mundo', u'http://www.farodevigo.es/elementosInt/rss/7'), +## (u'Opinión', u'http://www.farodevigo.es/elementosInt/rss/5'), + (u'Economía', u'http://www.farodevigo.es/elementosInt/rss/10'), + (u'Sociedad y Cultura', u'http://www.farodevigo.es/elementosInt/rss/8'), + (u'Sucesos', u'http://www.farodevigo.es/elementosInt/rss/9'), + (u'Deportes', u'http://www.farodevigo.es/elementosInt/rss/11'), + (u'Agenda', u'http://www.farodevigo.es/elementosInt/rss/21'), + (u'Gente', u'http://www.farodevigo.es/elementosInt/rss/24'), + (u'Televisión', u'http://www.farodevigo.es/elementosInt/rss/25'), + (u'Ciencia y Tecnología', u'http://www.farodevigo.es/elementosInt/rss/26')] + + extra_css = '''.noticia_texto{ font-family: sans-serif; font-size: medium; text-align: justify } + h1{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center} + h2{font-family: serif; font-size: medium; font-weight: bold; color: #000000; text-align: left} + .enlacenegrita10{font-family: serif; font-size: small; font-weight: bold; color: #000000; text-align: left} + .noticia_titular{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}''' + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + + url = 'http://estaticos00.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + url = 'http://estaticos01.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + url = 'http://estaticos02.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif' + fitem = soup.find('img',src=url) + if fitem: + par = fitem.parent + par.extract() + + return self.adeify_images(soup) + + def postprocess_html(self, soup, first_fetch): + divs = soup.findAll(True, {'class':'enlacenegrita10'}) + for div in divs: + div['align'] = 'left' + + return soup + + + keep_only_tags = [dict(name='div', attrs={'class':['noticias']})] + + remove_tags = [ + dict(name=['object','link','script','ul','iframe','ol']) + ,dict(name='div', attrs={'class':['noticiadd2', 'cintillo2', 'noticiadd', 'noticiadd2']}) + ,dict(name='div', attrs={'class':['imagen_derecha', 'noticiadd3', 'extraHTML']}) + + ] + + diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe index 6a61405698..162a3c774e 100644 --- a/resources/recipes/ming_pao.recipe +++ b/resources/recipes/ming_pao.recipe @@ -1,7 +1,9 @@ -cense__ = 'GPL v3' +__license__ = 
'GPL v3' __copyright__ = '2010, Eddie Lau' ''' modified from Singtao Toronto calibre recipe by rty +Change Log: +2010/10/31: skip repeated articles in section pages ''' import datetime @@ -23,42 +25,37 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe): recursions = 0 conversion_options = {'linearize_tables':True} masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), dict(attrs={'id':['newscontent01','newscontent02']})] def get_fetchdate(self): dt_utc = datetime.datetime.utcnow() - # convert UTC to local hk time - dt_local = dt_utc - datetime.timedelta(-8.0/24) + # convert UTC to local hk time - at around HKT 5.30am, all news are available + dt_local = dt_utc - datetime.timedelta(-2.5/24) return dt_local.strftime("%Y%m%d") def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) return feeds def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet']}) - current_articles = [] - for i in divs: - a = i.find('a', href = True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' +url + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet']}) + current_articles = [] + included_urls = [] + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' +url + if url not in included_urls: current_articles.append({'title': title, 'url': url, 'description':''}) - 
return current_articles - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(width=True): - del item['width'] - return soup + included_urls.append(url) + return current_articles diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst index a8ba41e8ff..545070f7ff 100644 --- a/setup/installer/windows/notes.rst +++ b/setup/installer/windows/notes.rst @@ -28,7 +28,9 @@ If there are no windows binaries already compiled for the version of python you Run the following command to install python dependencies:: - easy_install --always-unzip -U ipython mechanize BeautifulSoup pyreadline python-dateutil dnspython + easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython + +Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly) Qt -------- diff --git a/setup/server.py b/setup/server.py index 2103f4805a..0fea4ec733 100644 --- a/setup/server.py +++ b/setup/server.py @@ -5,10 +5,46 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import subprocess, tempfile, os, time +import subprocess, tempfile, os, time, sys, telnetlib +from threading import RLock from setup import Command +try: + from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent +except: + wm = None +else: + wm = WatchManager() + flags = EventsCodes.ALL_FLAGS + mask = flags['IN_MODIFY'] + + class ProcessEvents(ProcessEvent): + + def __init__(self, command): + ProcessEvent.__init__(self) + self.command = command + + def process_default(self, event): + name = getattr(event, + 'name', None) + if not name: + return + ext = os.path.splitext(name)[1] + reload = False + if ext == '.py': + reload = True + print + print name, 'changed' + self.command.kill_server() + self.command.launch_server() + print self.command.prompt, + sys.stdout.flush() + + if reload: + self.command.reload_browser(delay=1) + + class Server(Command): description = 'Run the calibre server in development mode conveniently' @@ -18,31 +54,75 @@ class Server(Command): def rebuild_monocole(self): subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH, '-I', 'src', 'src/monocle.js'], - stdout=open('resources/content_server/monocle.js', 'wb')) + stdout=open('resources/content_server/read/monocle.js', 'wb')) - def launch_server(self, log): - self.rebuild_monocole() - p = subprocess.Popen(['calibre-server', '--develop'], - stderr=subprocess.STDOUT, stdout=log) - return p + def launch_server(self): + print 'Starting server...\n' + with self.lock: + self.rebuild_monocole() + self.server_proc = p = subprocess.Popen(['calibre-server', '--develop'], + stderr=subprocess.STDOUT, stdout=self.server_log) + time.sleep(0.2) + if p.poll() is not None: + print 'Starting server failed' + raise SystemExit(1) + return p + + def kill_server(self): + print 'Killing server...\n' + if self.server_proc is not None: + with self.lock: + if self.server_proc.poll() is None: + self.server_proc.terminate() + while self.server_proc.poll() is None: + time.sleep(0.1) + + def watch(self): + if wm is not None: + self.notifier = ThreadedNotifier(wm, ProcessEvents(self)) + self.notifier.start() + self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True) + + def reload_browser(self, delay=0.1): + time.sleep(delay) + try: + t = telnetlib.Telnet('localhost', 4242) + t.read_until("repl>") + t.write('BrowserReload();') + print t.read_until("repl>") + t.close() + except: + print 
'Failed to reload browser' + import traceback + traceback.print_exc() def run(self, opts): + self.lock = RLock() tdir = tempfile.gettempdir() logf = os.path.join(tdir, 'calibre-server.log') - log = open(logf, 'ab') + self.server_log = open(logf, 'ab') + self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: ' print 'Server log available at:', logf + print + self.watch() + first = True while True: - print 'Starting server...' - p = self.launch_server(log) + self.launch_server() + if not first: + self.reload_browser() + first = False + try: - raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ') + raw_input(self.prompt) except: + print + self.kill_server() break else: - while p.returncode is None: - p.terminate() - time.sleep(0.1) - p.kill() + self.kill_server() print + if hasattr(self, 'notifier'): + self.notifier.stop() + diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 2945cc6604..3cc84f248d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -475,7 +475,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \ SOVOS, PICO from calibre.devices.sne.driver import SNE from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ - GEMEI, VELOCITYMICRO, PDNOVEL_KOBO + GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600 from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO @@ -586,6 +586,7 @@ plugins += [ AVANT, MENTOR, SWEEX, + Q600, KOGAN, PDNOVEL, SPECTRA, diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 9ad3cf3e08..74fa868255 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.epub import set_metadata from calibre.library.server.utils import strftime from calibre.utils.config import config_dir, prefs -from calibre.utils.date import isoformat, now, parse_date +from calibre.utils.date import now, parse_date from calibre.utils.logging import Log from calibre.utils.zipfile import ZipFile @@ -2521,11 +2521,11 @@ class ITUNES(DriverBase): metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour, old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo) else: - metadata.timestamp = isoformat(now()) + metadata.timestamp = now() if DEBUG: self.log.info(" add timestamp: %s" % metadata.timestamp) else: - metadata.timestamp = isoformat(now()) + metadata.timestamp = now() if DEBUG: self.log.warning(" missing block in OPF file") self.log.info(" add timestamp: %s" % metadata.timestamp) diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py index 92e26d47e4..af5a77ce03 100644 --- a/src/calibre/devices/misc.py +++ b/src/calibre/devices/misc.py @@ -72,6 +72,15 @@ class SWEEX(USBMS): EBOOK_DIR_MAIN = '' SUPPORTS_SUB_DIRS = True +class Q600(SWEEX): + + name = 'Digma Q600 Device interface' + gui_name = 'Q600' + description = _('Communicate with the Digma Q600') + + BCD = [0x325] + FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt'] + class KOGAN(SWEEX): name = 'Kogan Device Interface' diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index a8ff0f1ad0..81d996c6a7 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -9,11 +9,13 @@ Fetch metadata using Amazon AWS import sys, re 
 from lxml import html
+from lxml.html import soupparser

 from calibre import browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
+from calibre.library.comments import sanitize_comments_html

 def find_asin(br, isbn):
     q = 'http://www.amazon.com/s?field-keywords='+isbn
@@ -70,7 +72,10 @@ def get_metadata(br, asin, mi):
         return False
     raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
-    root = html.fromstring(raw)
+    try:
+        root = soupparser.fromstring(raw)
+    except:
+        return False
     ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
     if ratings:
         pat = re.compile(r'([0-9.]+) out of (\d+) stars')
@@ -95,25 +100,26 @@ def get_metadata(br, asin, mi):
     # remove all attributes from tags
     desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
     # Collapse whitespace
-    desc = re.sub('\n+', '\n', desc)
-    desc = re.sub(' +', ' ', desc)
+    #desc = re.sub('\n+', '\n', desc)
+    #desc = re.sub(' +', ' ', desc)
     # Remove the notice about text referring to out of print editions
     desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
     # Remove comments
     desc = re.sub(r'(?s)<!--.*?-->', '', desc)
-    mi.comments = desc
+    mi.comments = sanitize_comments_html(desc)
     return True

 def main(args=sys.argv):
     # Test xisbn
-    print get_social_metadata('Learning Python', None, None, '8324616489')
-    print
+    #print get_social_metadata('Learning Python', None, None, '8324616489')
+    #print

     # Test sophisticated comment formatting
-    print get_social_metadata('Swan Thieves', None, None, '9780316065795')
+    print get_social_metadata('Angels & Demons', None, None, '9781416580829')
     print
+    return

     # Random tests
     print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None,
             '9781416551720')
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 78585d13b6..4e93335af6 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -484,17 +484,22 @@ class DeviceMenu(QMenu): # {{{
                     _('Storage Card B')),
             ]

+        later_menus = []
         for menu in (self, self.set_default_menu):
             for actions, desc in (
                     (basic_actions, ''),
+                    (specific_actions, _('Send specific format to')),
                     (delete_actions, _('Send and delete from library')),
-                    (specific_actions, _('Send specific format to'))
                     ):
                 mdest = menu
                 if actions is not basic_actions:
-                    mdest = menu.addMenu(desc)
+                    mdest = QMenu(desc)
                     self._memory.append(mdest)
+                    later_menus.append(mdest)
+                    if menu is self.set_default_menu:
+                        menu.addMenu(mdest)
+                        menu.addSeparator()

                 for dest, delete, specific, icon, text in actions:
                     action = DeviceAction(dest, delete, specific, icon, text, self)
@@ -507,7 +512,7 @@ class DeviceMenu(QMenu): # {{{
                     action.a_s.connect(self.action_triggered)
                     self.actions.append(action)
                     mdest.addAction(action)
-                if actions is not specific_actions:
+                if actions is basic_actions:
                     menu.addSeparator()

         da = config['default_send_to_device_action']
@@ -525,14 +530,21 @@ class DeviceMenu(QMenu): # {{{
         self.group.triggered.connect(self.change_default_action)
         self.addSeparator()

+        self.addMenu(later_menus[0])
+        self.addSeparator()
+
         mitem = self.addAction(QIcon(I('eject.png')), _('Eject device'))
         mitem.setEnabled(False)
         mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit())
         self.disconnect_mounted_device_action = mitem
-        self.addSeparator()
+
         self.addMenu(self.set_default_menu)
         self.addSeparator()
+
+        self.addMenu(later_menus[1])
+        self.addSeparator()
+
         annot = self.addAction(_('Fetch annotations
(experimental)')) annot.setEnabled(False) annot.triggered.connect(lambda x : diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 670d9f2564..83eec89abe 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -11,11 +11,15 @@ from calibre.constants import preferred_encoding from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ CData, Comment, Declaration, ProcessingInstruction from calibre import prepare_string_for_xml +from calibre.utils.html2text import html2text +from calibre.ebooks.markdown import markdown # Hackish - ignoring sentences ending or beginning in numbers to avoid # confusion with decimal points. lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') +sanitize_pat = re.compile(r'
<script|<table|<tr|<td|<th|<style|<iframe',
+        re.IGNORECASE)
' + + # Explode lost CRs to \n\n comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', '.\r'), comments) @@ -115,6 +128,11 @@ def comments_to_html(comments): return result.renderContents(encoding=None) +def sanitize_comments_html(html): + text = html2text(html) + md = markdown.Markdown(safe_mode=True) + return md.convert(text) + def test(): for pat, val in [ ('lineone\n\nlinetwo', diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py index c9025a28f8..29636c5659 100644 --- a/src/calibre/library/server/base.py +++ b/src/calibre/library/server/base.py @@ -118,16 +118,17 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache, self.set_database(db) cherrypy.config.update({ - 'log.screen' : opts.develop, - 'engine.autoreload_on' : opts.develop, - 'tools.log_headers.on' : opts.develop, - 'checker.on' : opts.develop, - 'request.show_tracebacks': show_tracebacks, - 'server.socket_host' : listen_on, - 'server.socket_port' : opts.port, - 'server.socket_timeout' : opts.timeout, #seconds - 'server.thread_pool' : opts.thread_pool, # number of threads - }) + 'log.screen' : opts.develop, + 'engine.autoreload_on' : getattr(opts, + 'auto_reload', False), + 'tools.log_headers.on' : opts.develop, + 'checker.on' : opts.develop, + 'request.show_tracebacks': show_tracebacks, + 'server.socket_host' : listen_on, + 'server.socket_port' : opts.port, + 'server.socket_timeout' : opts.timeout, #seconds + 'server.thread_pool' : opts.thread_pool, # number of threads + }) if embedded or wsgi: cherrypy.config.update({'engine.SIGHUP' : None, 'engine.SIGTERM' : None,}) diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 9c442acc11..9530a34c73 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -123,9 +123,10 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{ def item(i): templ = (u'
<div title="{4}" class="category-item">'
-                '<div class="category-name">{0}</div><div>{1}</div>'
-                '<div>{2}</div>'
-                '<span class="href">{5}{3}</span></div>')
+                '<div class="category-name">'
+                '<a href="{5}{3}" title="{4}">{0}</a></div>'
+                '<div>{1}</div>'
+                '<div>{2}</div></div>
') rating, rstring = render_rating(i.avg_rating, prefix) name = xml(i.name) if datatype == 'rating': @@ -142,7 +143,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{ q = category href = '/browse/matches/%s/%s'%(quote(q), quote(id_)) return templ.format(xml(name), rating, - xml(desc), xml(href), rstring, prefix) + xml(desc), xml(href, True), rstring, prefix) items = list(map(item, items)) return '\n'.join(['
<div>'] + items + ['</div>
']) @@ -252,8 +253,6 @@ class BrowseServer(object): lp = self.db.library_path if isbytestring(lp): lp = force_unicode(lp, filesystem_encoding) - if isinstance(ans, unicode): - ans = ans.encode('utf-8') ans = ans.replace('{library_name}', xml(os.path.basename(lp))) ans = ans.replace('{library_path}', xml(lp, True)) ans = ans.replace('{initial_search}', initial_search) @@ -335,9 +334,10 @@ class BrowseServer(object): icon = 'blank.png' cats.append((meta['name'], category, icon)) - cats = [('
<li title="{2} {0}"><img src="{3}{src}" alt="{0}" />'
-            '<span class="label">{0}</span>'
-            '<span class="url">{3}/browse/category/{1}</span></li>')
+        cats = [('<li title="{2} {0}">'
+            '<a href="{3}/browse/category/{1}"><img src="{3}{src}" alt="{0}" />'
+            '<span class="label">{0}</span></a>'
+            '</li>')
             .format(xml(x, True), xml(quote(y)), xml(_('Browse books by')),
                 self.opts.url_prefix, src='/browse/icon/'+z)
             for x, y, z in cats]

@@ -393,14 +393,15 @@ class BrowseServer(object):
         for x in sorted(starts):
             category_groups[x] = len([y for y in items if
                 getter(y).upper().startswith(x)])

-        items = [(u'<h3>{0} [{2}]</h3><div>'
+        items = [(u'<h3><a class="load_href"'
+                u' href="{4}{3}">{0} [{2}]</a></h3><div>'
                 u'<div class="loaded" style="display:none"></div>'
                 u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" /><em>{1}</em></div>'
-                u'<span class="load_href">{4}{3}</span></div>').format(
+                u'</div>
    ').format( xml(s, True), xml(_('Loading, please wait'))+'…', unicode(c), - xml(u'/browse/category_group/%s/%s'%(category, s)), + xml(u'/browse/category_group/%s/%s'%(category, s), True), self.opts.url_prefix) for s, c in category_groups.items()] items = '\n\n'.join(items) @@ -563,7 +564,8 @@ class BrowseServer(object): if not val: val = '' args[key] = xml(val, True) - fname = ascii_filename(args['title']) + ' - ' + ascii_filename(args['authors']) + fname = quote(ascii_filename(args['title']) + ' - ' + + ascii_filename(args['authors'])) return args, fmt, fmts, fname @Endpoint(mimetype='application/json; charset=utf-8') diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 670c31b9df..6437f02cb6 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -70,10 +70,10 @@ class ContentServer(object): id = id.rpartition('_')[-1].partition('.')[0] match = re.search(r'\d+', id) if not match: - raise cherrypy.HTTPError(400, 'id:%s not an integer'%id) + raise cherrypy.HTTPError(404, 'id:%s not an integer'%id) id = int(match.group()) if not self.db.has_id(id): - raise cherrypy.HTTPError(400, 'id:%d does not exist in database'%id) + raise cherrypy.HTTPError(404, 'id:%d does not exist in database'%id) if what == 'thumb' or what.startswith('thumb_'): try: width, height = map(int, what.split('_')[1:]) diff --git a/src/calibre/library/server/main.py b/src/calibre/library/server/main.py index fbd811a1ab..b7cb3ecf12 100644 --- a/src/calibre/library/server/main.py +++ b/src/calibre/library/server/main.py @@ -58,6 +58,9 @@ The OPDS interface is advertised via BonJour automatically. help=_('Specifies a restriction to be used for this invocation. ' 'This option overrides any per-library settings specified' ' in the GUI')) + parser.add_option('--auto-reload', default=False, action='store_true', + help=_('Auto reload server when source code changes. May not' + ' work in all environments.')) return parser diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py index a889089109..d66e6d842f 100644 --- a/src/calibre/library/server/mobile.py +++ b/src/calibre/library/server/mobile.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import re, os import __builtin__ +from urllib import quote import cherrypy from lxml import html @@ -115,8 +116,8 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS, data = TD() for fmt in book['formats'].split(','): - a = ascii_filename(book['authors']) - t = ascii_filename(book['title']) + a = quote(ascii_filename(book['authors'])) + t = quote(ascii_filename(book['title'])) s = SPAN( A( fmt.lower(), diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py new file mode 100644 index 0000000000..0eb84a3d38 --- /dev/null +++ b/src/calibre/utils/html2text.py @@ -0,0 +1,452 @@ +#!/usr/bin/env python +"""html2text: Turn HTML into equivalent Markdown-structured text.""" +__version__ = "2.39" +__author__ = "Aaron Swartz (me@aaronsw.com)" +__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." +__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] + +# TODO: +# Support decoded entities with unifiable. 
+ +if not hasattr(__builtins__, 'True'): True, False = 1, 0 +import re, sys, urllib, htmlentitydefs, codecs +import sgmllib +import urlparse +sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') + +try: from textwrap import wrap +except: pass + +# Use Unicode characters instead of their ascii psuedo-replacements +UNICODE_SNOB = 1 + +# Put the links after each paragraph instead of at the end. +LINKS_EACH_PARAGRAPH = 0 + +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +BODY_WIDTH = 0 + +# Don't show internal links (href="#local-anchor") -- corresponding link targets +# won't be visible in the plain text file anyway. +SKIP_INTERNAL_LINKS = True + +### Entity Nonsense ### + +def name2cp(k): + if k == 'apos': return ord("'") + if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 + return htmlentitydefs.name2codepoint[k] + else: + k = htmlentitydefs.entitydefs[k] + if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 + return ord(codecs.latin_1_decode(k)[0]) + +unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', +'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', +'ndash':'-', 'oelig':'oe', 'aelig':'ae', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} + +unifiable_n = {} + +for k in unifiable.keys(): + unifiable_n[name2cp(k)] = unifiable[k] + +def charref(name): + if name[0] in ['x','X']: + c = int(name[1:], 16) + else: + c = int(name) + + if not UNICODE_SNOB and c in unifiable_n.keys(): + return unifiable_n[c] + else: + return unichr(c) + +def entityref(c): + if not UNICODE_SNOB and c in unifiable.keys(): + return unifiable[c] + else: + try: name2cp(c) + except KeyError: return "&" + c + else: return unichr(name2cp(c)) + +def replaceEntities(s): + s = s.group(1) + if s[0] == "#": + return charref(s[1:]) + else: return entityref(s) + +r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") +def unescape(s): + return r_unescape.sub(replaceEntities, s) + +def fixattrs(attrs): + # Fix bug in sgmllib.py + if not attrs: return attrs + newattrs = [] + for attr in attrs: + newattrs.append((attr[0], unescape(attr[1]))) + return newattrs + +### End Entity Nonsense ### + +def onlywhite(line): + """Return true if the line does only consist of whitespace characters.""" + for c in line: + if c is not ' ' and c is not ' ': + return c is ' ' + return line + +def optwrap(text): + """Wrap all paragraphs in the provided text.""" + if not BODY_WIDTH: + return text + + assert wrap, "Requires Python 2.3." 
+ result = '' + newlines = 0 + for para in text.split("\n"): + if len(para) > 0: + if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': + for line in wrap(para, BODY_WIDTH): + result += line + "\n" + result += "\n" + newlines = 2 + else: + if not onlywhite(para): + result += para + "\n" + newlines = 1 + else: + if newlines < 2: + result += "\n" + newlines += 1 + return result + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): return n + except ValueError: return 0 + +class _html2text(sgmllib.SGMLParser): + def __init__(self, out=None, baseurl=''): + sgmllib.SGMLParser.__init__(self) + + if out is None: self.out = self.outtextf + else: self.out = out + self.outtext = u'' + self.quiet = 0 + self.p_p = 0 + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.lastWasNL = 0 + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + def outtextf(self, s): + self.outtext += s + + def close(self): + sgmllib.SGMLParser.close(self) + + self.pbr() + self.o('', 0, 'end') + + return self.outtext + + def handle_charref(self, c): + self.o(charref(c)) + + def handle_entityref(self, c): + self.o(entityref(c)) + + def unknown_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def unknown_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ returns the index of certain set of attributes (of a link) in the + self.a list + + If the set of attributes is not found, returns None + """ + if not attrs.has_key('href'): return None + + i = -1 + for a in self.a: + i += 1 + match = 0 + + if a.has_key('href') and a['href'] == attrs['href']: + if a.has_key('title') or attrs.has_key('title'): + if (a.has_key('title') and attrs.has_key('title') and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: return i + + def handle_tag(self, tag, attrs, start): + attrs = fixattrs(attrs) + + if hn(tag): + self.p() + if start: self.o(hn(tag)*"#" + ' ') + + if tag in ['p', 'div']: self.p() + + if tag == "br" and start: self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: self.quiet += 1 + else: self.quiet -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p(); self.o('> ', 0, 1); self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u']: self.o("_") + if tag in ['strong', 'b']: self.o("**") + if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` + if tag == "abbr": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + + self.abbr_title = None + self.abbr_data = '' + if attrs.has_key('title'): + self.abbr_title = attrs['title'] + else: + if self.abbr_title != None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = '' + + if tag == "a": + if start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + self.astack.append(attrs) + self.o("[") + else: + self.astack.append(None) + else: + if 
self.astack: + a = self.astack.pop() + if a: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + `a['count']` + "]") + + if tag == "img" and start: + attrsD = {} + for (x, y) in attrs: attrsD[x] = y + attrs = attrsD + if attrs.has_key('src'): + attrs['href'] = attrs['src'] + alt = attrs.get('alt', '') + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("![") + self.o(alt) + self.o("]["+`attrs['count']`+"]") + + if tag == 'dl' and start: self.p() + if tag == 'dt' and not start: self.pbr() + if tag == 'dd' and start: self.o(' ') + if tag == 'dd' and not start: self.pbr() + + if tag in ["ol", "ul"]: + if start: + self.list.append({'name':tag, 'num':0}) + else: + if self.list: self.list.pop() + + self.p() + + if tag == 'li': + if start: + self.pbr() + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + self.o(" "*len(self.list)) #TODO: line up
    1. s > 9 correctly. + if li['name'] == "ul": self.o("* ") + elif li['name'] == "ol": + li['num'] += 1 + self.o(`li['num']`+". ") + self.start = 1 + else: + self.pbr() + + if tag in ["table", "tr"] and start: self.p() + if tag == 'td': self.pbr() + + if tag == "pre": + if start: + self.startpre = 1 + self.pre = 1 + else: + self.pre = 0 + self.p() + + def pbr(self): + if self.p_p == 0: self.p_p = 1 + + def p(self): self.p_p = 2 + + def o(self, data, puredata=0, force=0): + if self.abbr_data is not None: self.abbr_data += data + + if not self.quiet: + if puredata and not self.pre: + data = re.sub('\s+', ' ', data) + if data and data[0] == ' ': + self.space = 1 + data = data[1:] + if not data and not force: return + + if self.startpre: + #self.out(" :") #TODO: not output when already one there + self.startpre = 0 + + bq = (">" * self.blockquote) + if not (force and data and data[0] == ">") and self.blockquote: bq += " " + + if self.pre: + bq += " " + data = data.replace("\n", "\n"+bq) + + if self.start: + self.space = 0 + self.p_p = 0 + self.start = 0 + + if force == 'end': + # It's the end. + self.p_p = 0 + self.out("\n") + self.space = 0 + + + if self.p_p: + self.out(('\n'+bq)*self.p_p) + self.space = 0 + + if self.space: + if not self.lastWasNL: self.out(' ') + self.space = 0 + + if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): + if force == "end": self.out("\n") + + newa = [] + for link in self.a: + if self.outcount > link['outcount']: + self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) + if link.has_key('title'): self.out(" ("+link['title']+")") + self.out("\n") + else: + newa.append(link) + + if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. + + self.a = newa + + if self.abbr_list and force == "end": + for abbr, definition in self.abbr_list.items(): + self.out(" *[" + abbr + "]: " + definition + "\n") + + self.p_p = 0 + self.out(data) + self.lastWasNL = data and data[-1] == '\n' + self.outcount += 1 + + def handle_data(self, data): + if r'\/script>' in data: self.quiet -= 1 + self.o(data, 1) + + def unknown_decl(self, data): pass + +def wrapwrite(text): sys.stdout.write(text.encode('utf8')) + +def html2text_file(html, out=wrapwrite, baseurl=''): + h = _html2text(out, baseurl) + h.feed(html) + h.feed("") + return h.close() + +def html2text(html, baseurl=''): + return optwrap(html2text_file(html, None, baseurl)) + +if __name__ == "__main__": + baseurl = '' + if sys.argv[1:]: + arg = sys.argv[1] + if arg.startswith('http://') or arg.startswith('https://'): + baseurl = arg + j = urllib.urlopen(baseurl) + try: + from feedparser import _getCharacterEncoding as enc + enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + text = j.read() + encoding = enc(j.headers, text)[0] + if encoding == 'us-ascii': encoding = 'utf-8' + data = text.decode(encoding) + + else: + encoding = 'utf8' + if len(sys.argv) > 2: + encoding = sys.argv[2] + data = open(arg, 'r').read().decode(encoding) + else: + data = sys.stdin.read().decode('utf8') + wrapwrite(html2text(data, baseurl)) +
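
A note on the comment-sanitization path introduced above: the Amazon metadata
fetcher now routes scraped descriptions through the new sanitize_comments_html()
helper in src/calibre/library/comments.py, which converts untrusted HTML to
Markdown-style text with the bundled html2text and then re-renders it with the
bundled markdown in safe_mode, so stray tags and scripts from the scraped page
cannot leak into library comments. A minimal sketch of that round trip, assuming
a calibre source checkout on sys.path; the sample HTML and the demo() wrapper
are invented for illustration::

    # Sketch of the sanitize_comments_html() round trip added by this patch.
    # Assumes calibre's src/ directory is importable; the input is made up.
    from calibre.utils.html2text import html2text
    from calibre.ebooks.markdown import markdown

    def demo(raw_html):
        text = html2text(raw_html)              # HTML -> Markdown-style text
        md = markdown.Markdown(safe_mode=True)  # safe_mode neutralizes raw HTML
        return md.convert(text)                 # back to clean HTML

    print demo(u'<div>A <b>fine</b> read.<script>alert(1)</script></div>')

This is presumably also why the two whitespace-collapsing re.sub() calls in
amazon.py could be commented out: html2text normalizes whitespace as part of
the conversion.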