diff --git a/resources/content_server/browse/browse.css b/resources/content_server/browse/browse.css
index 92ed4c3ce6..1243795e55 100644
--- a/resources/content_server/browse/browse.css
+++ b/resources/content_server/browse/browse.css
@@ -208,6 +208,8 @@ h2.library_name {
}
+.toplevel li a { text-decoration: none; }
+
.toplevel li img {
vertical-align: middle;
margin-right: 1em;
@@ -261,9 +263,16 @@ h2.library_name {
}
-.category div.category-item span.href { display: none }
+.category div.category-item a { text-decoration: none; color: inherit; }
-#groups span.load_href { display: none }
+#groups a.load_href {
+ text-decoration: none;
+ color: inherit;
+ font-size: medium;
+ font-weight: normal;
+ padding: 0;
+ padding-left: 0.5em;
+}
#groups h3 {
font-weight: bold;
diff --git a/resources/content_server/browse/browse.js b/resources/content_server/browse/browse.js
index 89ce679871..e0585a9afd 100644
--- a/resources/content_server/browse/browse.js
+++ b/resources/content_server/browse/browse.js
@@ -116,7 +116,7 @@ function toplevel() {
$(".sort_select").hide();
$(".toplevel li").click(function() {
- var href = $(this).children("span.url").text();
+ var href = $(this).children("a").attr('href');
window.location = href;
});
@@ -133,7 +133,7 @@ function render_error(msg) {
// Category feed {{{
function category_clicked() {
- var href = $(this).find("span.href").html();
+ var href = $(this).find("a").attr('href');
window.location = href;
}
@@ -151,7 +151,7 @@ function category() {
change: function(event, ui) {
if (ui.newContent) {
- var href = ui.newContent.children("span.load_href").html();
+ var href = ui.newContent.prev().children("a.load_href").attr('href');
ui.newContent.children(".loading").show();
if (href) {
$.ajax({
diff --git a/resources/content_server/monocle.js b/resources/content_server/read/monocle.js
similarity index 100%
rename from resources/content_server/monocle.js
rename to resources/content_server/read/monocle.js
diff --git a/resources/recipes/clic_rbs.recipe b/resources/recipes/clic_rbs.recipe
new file mode 100644
index 0000000000..559dfa2000
--- /dev/null
+++ b/resources/recipes/clic_rbs.recipe
@@ -0,0 +1,50 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ClicRBS(BasicNewsRecipe):
+ title = u'ClicRBS'
+ language = 'pt'
+ __author__ = 'arvoredo'
+ oldest_article = 3
+ max_articles_per_feed = 9
+ cover_url = 'http://www.publicidade.clicrbs.com.br/clicrbs/imgs/logo_clic.gif'
+
+ remove_tags = [
+ dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']})
+ ]
+
+ remove_tags_before = dict(name='div', attrs={'class':'descricao'})
+ remove_tags_before = dict(name='div', attrs={'id':'glb-corpo'})
+ remove_tags_before = dict(name='div', attrs={'class':'descricao'})
+ remove_tags_before = dict(name='div', attrs={'class':'coluna'})
+ remove_tags_after = dict(name='div', attrs={'class':'extra'})
+ remove_tags_after = dict(name='div', attrs={'id':'links-patrocinados'})
+ remove_tags_after = dict(name='h4', attrs={'class':'tipo-c comente'})
+ remove_tags_after = dict(name='ul', attrs={'class':'lista'})
+
+ feeds = [
+ (u'zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=1&local=1&channel=13')
+ , (u'diariocatarinense.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=2&local=18&channel=67')
+ , (u'Concursos e Emprego', u'http://g1.globo.com/Rss2/0,,AS0-9654,00.xml')
+ , (u'Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?channel=87&uf=1&local=1')
+ , (u'Economia, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=801&uf=1&local=1&channel=13')
+ , (u'Esportes, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=802&uf=1&local=1&channel=13')
+ , (u'Economia, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1180&channel=87&uf=1&local=1')
+ , (u'Política, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1185&channel=87&uf=1&local=1')
+ , (u'Mundo, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1184&channel=87&uf=1&local=1')
+ , (u'Catarinense, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=&theme=371&uf=2&channel=2')
+ , (u'Geral, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1183&channel=87&uf=1&local=1')
+ , (u'Estilo de Vida, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=805&uf=1&local=1&channel=13')
+ , (u'Corrida, Corrida, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1313&theme=15704&uf=1&channel=2')
+ , (u'Jornal de Santa Catarina, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?espid=159&uf=2&local=18')
+ , (u'Grêmio, Futebol, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=11&theme=65&uf=1&channel=2')
+ , (u'Velocidade, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1314&theme=2655&uf=1&channel=2')
+ ]
+
+ extra_css = '''
+ cite{color:#007BB5; font-size:xx-small; font-style:italic;}
+ body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+ h3{font-size:large; color:#082963; font-weight:bold;}
+ #ident{color:#0179B4; font-size:xx-small;}
+ p{color:#000000;font-weight:normal;}
+ .commentario p{color:#007BB5; font-style:italic;}
+ '''
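A note on the recipe above: `remove_tags_before` and `remove_tags_after` are scalar class attributes in `BasicNewsRecipe`, so the stacked assignments overwrite one another and only the last value of each actually takes effect. To strip several kinds of markup, `remove_tags` takes a list; a sketch reusing selectors from this recipe:

```python
# Sketch only: remove_tags accepts any number of matchers, unlike the
# scalar remove_tags_before/remove_tags_after attributes.
remove_tags = [
    dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']}),
    dict(name='div', attrs={'id':'links-patrocinados'}),
    dict(name='h4', attrs={'class':'tipo-c comente'}),
]
```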
diff --git a/resources/recipes/cm_journal.recipe b/resources/recipes/cm_journal.recipe
new file mode 100644
index 0000000000..c47fb35775
--- /dev/null
+++ b/resources/recipes/cm_journal.recipe
@@ -0,0 +1,44 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CMJornal_pt(BasicNewsRecipe):
+ title = 'Correio da Manha - Portugal'
+ __author__ = 'jmst'
+ description = 'As noticias de Portugal e do Mundo'
+ publisher = 'Cofina Media'
+ category = ''
+ oldest_article = 1
+ no_stylesheets = True
+ encoding = 'utf-8'
+ use_embedded_content = False
+ language = 'pt'
+ extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} '
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ keep_only_tags = [
+ dict(name=['h2','h1'])
+ , dict(name='div', attrs={'class': ['news']})
+ ]
+
+ remove_tags = [
+ dict(name=['object','embed','iframe'])
+ ,dict(name='a',attrs={'href':['#']})
+ ]
+
+ feeds = [
+ (u'Actualidade' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000009-0000-0000-0000-000000000009' )
+ ,(u'Portugal' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000010-0000-0000-0000-000000000010' )
+ ,(u'Economia' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000011-0000-0000-0000-000000000011' )
+ ,(u'Mundo' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000091-0000-0000-0000-000000000091' )
+ ,(u'Desporto' , u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000012-0000-0000-0000-000000000012' )
+ ,(u'TV & Media', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000092-0000-0000-0000-000000000092')
+ ]
+
+ def print_version(self, url):
+ return url.replace('noticia.aspx', 'Imprimir.aspx')
+
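The `print_version` hook runs once per article URL before download, so the recipe fetches the print-friendly pages; roughly:

```python
# Illustrative only -- the query string below is made up:
url = 'http://www.cmjornal.xl.pt/Detalhe/noticia.aspx?contentid=123'
print url.replace('noticia.aspx', 'Imprimir.aspx')
# http://www.cmjornal.xl.pt/Detalhe/Imprimir.aspx?contentid=123
```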
diff --git a/resources/recipes/el_faro.recipe b/resources/recipes/el_faro.recipe
new file mode 100644
index 0000000000..ec1b74b5cb
--- /dev/null
+++ b/resources/recipes/el_faro.recipe
@@ -0,0 +1,77 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElFaroDeVigo(BasicNewsRecipe):
+ title = u'El Faro de Vigo'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ __author__ = 'Jefferson Frantz'
+ description = 'Noticias de Vigo'
+ timefmt = ' [%d %b, %Y]'
+ language = 'es'
+ encoding = 'cp1252'
+ no_stylesheets = True
+ remove_javascript = True
+
+ feeds = [
+## (u'Vigo', u'http://www.farodevigo.es/elementosInt/rss/1'),
+## (u'Gran Vigo', u'http://www.farodevigo.es/elementosInt/rss/2'),
+ (u'Galicia', u'http://www.farodevigo.es/elementosInt/rss/4'),
+ (u'España', u'http://www.farodevigo.es/elementosInt/rss/6'),
+ (u'Mundo', u'http://www.farodevigo.es/elementosInt/rss/7'),
+## (u'Opinión', u'http://www.farodevigo.es/elementosInt/rss/5'),
+ (u'Economía', u'http://www.farodevigo.es/elementosInt/rss/10'),
+ (u'Sociedad y Cultura', u'http://www.farodevigo.es/elementosInt/rss/8'),
+ (u'Sucesos', u'http://www.farodevigo.es/elementosInt/rss/9'),
+ (u'Deportes', u'http://www.farodevigo.es/elementosInt/rss/11'),
+ (u'Agenda', u'http://www.farodevigo.es/elementosInt/rss/21'),
+ (u'Gente', u'http://www.farodevigo.es/elementosInt/rss/24'),
+ (u'Televisión', u'http://www.farodevigo.es/elementosInt/rss/25'),
+ (u'Ciencia y Tecnología', u'http://www.farodevigo.es/elementosInt/rss/26')]
+
+ extra_css = '''.noticia_texto{ font-family: sans-serif; font-size: medium; text-align: justify }
+ h1{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}
+ h2{font-family: serif; font-size: medium; font-weight: bold; color: #000000; text-align: left}
+ .enlacenegrita10{font-family: serif; font-size: small; font-weight: bold; color: #000000; text-align: left}
+ .noticia_titular{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}'''
+
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+
+ url = 'http://estaticos00.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+ fitem = soup.find('img',src=url)
+ if fitem:
+ par = fitem.parent
+ par.extract()
+ url = 'http://estaticos01.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+ fitem = soup.find('img',src=url)
+ if fitem:
+ par = fitem.parent
+ par.extract()
+ url = 'http://estaticos02.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+ fitem = soup.find('img',src=url)
+ if fitem:
+ par = fitem.parent
+ par.extract()
+
+ return self.adeify_images(soup)
+
+ def postprocess_html(self, soup, first_fetch):
+ divs = soup.findAll(True, {'class':'enlacenegrita10'})
+ for div in divs:
+ div['align'] = 'left'
+
+ return soup
+
+
+ keep_only_tags = [dict(name='div', attrs={'class':['noticias']})]
+
+ remove_tags = [
+ dict(name=['object','link','script','ul','iframe','ol'])
+ ,dict(name='div', attrs={'class':['noticiadd2', 'cintillo2', 'noticiadd', 'noticiadd2']})
+ ,dict(name='div', attrs={'class':['imagen_derecha', 'noticiadd3', 'extraHTML']})
+
+ ]
+
+
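The three `barrapunto.gif` removals in `preprocess_html` differ only in the numbered static host; the same behaviour fits in a loop:

```python
# Equivalent sketch of the three repeated blocks:
for host in ('estaticos00', 'estaticos01', 'estaticos02'):
    url = ('http://%s.farodevigo.es//elementosWeb/mediaweb/images/'
           'compartir/barrapunto.gif' % host)
    fitem = soup.find('img', src=url)
    if fitem:
        fitem.parent.extract()
```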
diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 6a61405698..162a3c774e 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
-cense__ = 'GPL v3'
+__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
'''
modified from Singtao Toronto calibre recipe by rty
+Change Log:
+2010/10/31: skip repeated articles in section pages
'''
import datetime
@@ -23,42 +25,37 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
recursions = 0
conversion_options = {'linearize_tables':True}
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
-
keep_only_tags = [dict(name='h1'),
dict(attrs={'id':['newscontent01','newscontent02']})]
def get_fetchdate(self):
dt_utc = datetime.datetime.utcnow()
- # convert UTC to local hk time
- dt_local = dt_utc - datetime.timedelta(-8.0/24)
+ # convert UTC to local hk time - at around HKT 5.30am, all news are available
+ dt_local = dt_utc - datetime.timedelta(-2.5/24)
return dt_local.strftime("%Y%m%d")
def parse_index(self):
- feeds = []
- dateStr = self.get_fetchdate()
- for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
- articles = self.parse_section(url)
- if articles:
- feeds.append((title, articles))
+ feeds = []
+ dateStr = self.get_fetchdate()
+ for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
+ articles = self.parse_section(url)
+ if articles:
+ feeds.append((title, articles))
return feeds
def parse_section(self, url):
- dateStr = self.get_fetchdate()
- soup = self.index_to_soup(url)
- divs = soup.findAll(attrs={'class': ['bullet']})
- current_articles = []
- for i in divs:
- a = i.find('a', href = True)
- title = self.tag_to_string(a)
- url = a.get('href', False)
- url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ dateStr = self.get_fetchdate()
+ soup = self.index_to_soup(url)
+ divs = soup.findAll(attrs={'class': ['bullet']})
+ current_articles = []
+ included_urls = []
+ for i in divs:
+ a = i.find('a', href = True)
+ title = self.tag_to_string(a)
+ url = a.get('href', False)
+ url = 'http://news.mingpao.com/' + dateStr + '/' +url
+ if url not in included_urls:
current_articles.append({'title': title, 'url': url, 'description':''})
- return current_articles
-
- def preprocess_html(self, soup):
- for item in soup.findAll(style=True):
- del item['style']
- for item in soup.findAll(width=True):
- del item['width']
- return soup
+ included_urls.append(url)
+ return current_articles
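The new offset in `get_fetchdate` deserves a gloss: subtracting `timedelta(-2.5/24)` adds 2.5 hours to UTC, so the computed edition date only rolls over at 21:30 UTC, i.e. 05:30 HKT (UTC+8), once the day's news is fully online:

```python
import datetime

# 21:29 UTC = 05:29 HKT next day: still yesterday's edition
print (datetime.datetime(2010, 10, 31, 21, 29)
       - datetime.timedelta(-2.5/24)).strftime("%Y%m%d")   # 20101031
# two minutes later the edition date flips
print (datetime.datetime(2010, 10, 31, 21, 31)
       - datetime.timedelta(-2.5/24)).strftime("%Y%m%d")   # 20101101
```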
diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst
index a8ba41e8ff..545070f7ff 100644
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@@ -28,7 +28,9 @@ If there are no windows binaries already compiled for the version of python you
Run the following command to install python dependencies::
- easy_install --always-unzip -U ipython mechanize BeautifulSoup pyreadline python-dateutil dnspython
+ easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython
+
+Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly).
Qt
--------
diff --git a/setup/server.py b/setup/server.py
index 2103f4805a..0fea4ec733 100644
--- a/setup/server.py
+++ b/setup/server.py
@@ -5,10 +5,46 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import subprocess, tempfile, os, time
+import subprocess, tempfile, os, time, sys, telnetlib
+from threading import RLock
from setup import Command
+try:
+ from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent
+except:
+ wm = None
+else:
+ wm = WatchManager()
+ flags = EventsCodes.ALL_FLAGS
+ mask = flags['IN_MODIFY']
+
+ class ProcessEvents(ProcessEvent):
+
+ def __init__(self, command):
+ ProcessEvent.__init__(self)
+ self.command = command
+
+ def process_default(self, event):
+ name = getattr(event,
+ 'name', None)
+ if not name:
+ return
+ ext = os.path.splitext(name)[1]
+ reload = False
+ if ext == '.py':
+ reload = True
+ print
+ print name, 'changed'
+ self.command.kill_server()
+ self.command.launch_server()
+ print self.command.prompt,
+ sys.stdout.flush()
+
+ if reload:
+ self.command.reload_browser(delay=1)
+
+
class Server(Command):
description = 'Run the calibre server in development mode conveniently'
@@ -18,31 +54,75 @@ class Server(Command):
def rebuild_monocole(self):
subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH,
'-I', 'src', 'src/monocle.js'],
- stdout=open('resources/content_server/monocle.js', 'wb'))
+ stdout=open('resources/content_server/read/monocle.js', 'wb'))
- def launch_server(self, log):
- self.rebuild_monocole()
- p = subprocess.Popen(['calibre-server', '--develop'],
- stderr=subprocess.STDOUT, stdout=log)
- return p
+ def launch_server(self):
+ print 'Starting server...\n'
+ with self.lock:
+ self.rebuild_monocole()
+ self.server_proc = p = subprocess.Popen(['calibre-server', '--develop'],
+ stderr=subprocess.STDOUT, stdout=self.server_log)
+ time.sleep(0.2)
+ if p.poll() is not None:
+ print 'Starting server failed'
+ raise SystemExit(1)
+ return p
+
+ def kill_server(self):
+ print 'Killing server...\n'
+ if self.server_proc is not None:
+ with self.lock:
+ if self.server_proc.poll() is None:
+ self.server_proc.terminate()
+ while self.server_proc.poll() is None:
+ time.sleep(0.1)
+
+ def watch(self):
+ if wm is not None:
+ self.notifier = ThreadedNotifier(wm, ProcessEvents(self))
+ self.notifier.start()
+ self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True)
+
+ def reload_browser(self, delay=0.1):
+ time.sleep(delay)
+ try:
+ t = telnetlib.Telnet('localhost', 4242)
+ t.read_until("repl>")
+ t.write('BrowserReload();')
+ print t.read_until("repl>")
+ t.close()
+ except:
+ print 'Failed to reload browser'
+ import traceback
+ traceback.print_exc()
def run(self, opts):
+ self.lock = RLock()
tdir = tempfile.gettempdir()
logf = os.path.join(tdir, 'calibre-server.log')
- log = open(logf, 'ab')
+ self.server_log = open(logf, 'ab')
+ self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: '
print 'Server log available at:', logf
+ print
+ self.watch()
+ first = True
while True:
- print 'Starting server...'
- p = self.launch_server(log)
+ self.launch_server()
+ if not first:
+ self.reload_browser()
+ first = False
+
try:
- raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ')
+ raw_input(self.prompt)
except:
+ print
+ self.kill_server()
break
else:
- while p.returncode is None:
- p.terminate()
- time.sleep(0.1)
- p.kill()
+ self.kill_server()
print
+ if hasattr(self, 'notifier'):
+ self.notifier.stop()
+
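Two external pieces make the auto-reload work: pyinotify (optional, hence the guarded import that falls back to `wm = None`) to watch `src/` for edits, and a Firefox MozRepl listener on port 4242 that accepts `BrowserReload();`. A stripped-down sketch of the same mechanism, assuming both are available:

```python
import telnetlib
from pyinotify import (WatchManager, ThreadedNotifier, EventsCodes,
        ProcessEvent)

class Reload(ProcessEvent):
    def process_default(self, event):
        name = getattr(event, 'name', None)
        if name and name.endswith('.py'):
            # Ask the MozRepl-enabled Firefox to reload the current tab
            t = telnetlib.Telnet('localhost', 4242)
            t.read_until("repl>")
            t.write('BrowserReload();')
            t.close()

wm = WatchManager()
notifier = ThreadedNotifier(wm, Reload())
notifier.start()
wm.add_watch('src', EventsCodes.ALL_FLAGS['IN_MODIFY'], rec=True)
```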
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 2945cc6604..3cc84f248d 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -475,7 +475,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
SOVOS, PICO
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
- GEMEI, VELOCITYMICRO, PDNOVEL_KOBO
+ GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
@@ -586,6 +586,7 @@ plugins += [
AVANT,
MENTOR,
SWEEX,
+ Q600,
KOGAN,
PDNOVEL,
SPECTRA,
diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py
index 9ad3cf3e08..74fa868255 100644
--- a/src/calibre/devices/apple/driver.py
+++ b/src/calibre/devices/apple/driver.py
@@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.epub import set_metadata
from calibre.library.server.utils import strftime
from calibre.utils.config import config_dir, prefs
-from calibre.utils.date import isoformat, now, parse_date
+from calibre.utils.date import now, parse_date
from calibre.utils.logging import Log
from calibre.utils.zipfile import ZipFile
@@ -2521,11 +2521,11 @@ class ITUNES(DriverBase):
metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour,
old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo)
else:
- metadata.timestamp = isoformat(now())
+ metadata.timestamp = now()
if DEBUG:
self.log.info(" add timestamp: %s" % metadata.timestamp)
else:
- metadata.timestamp = isoformat(now())
+ metadata.timestamp = now()
if DEBUG:
self.log.warning(" missing block in OPF file")
self.log.info(" add timestamp: %s" % metadata.timestamp)
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index 92e26d47e4..af5a77ce03 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -72,6 +72,15 @@ class SWEEX(USBMS):
EBOOK_DIR_MAIN = ''
SUPPORTS_SUB_DIRS = True
+class Q600(SWEEX):
+
+ name = 'Digma Q600 Device interface'
+ gui_name = 'Q600'
+ description = _('Communicate with the Digma Q600')
+
+ BCD = [0x325]
+ FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
+
class KOGAN(SWEEX):
name = 'Kogan Device Interface'
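The Q600 driver above inherits the USB vendor/product ids from SWEEX and is told apart only by its `BCD` (bcdDevice revision), plus a wider `FORMATS` list. Another rebadge of the same hardware would look much the same; a purely hypothetical sketch:

```python
# Hypothetical device, for illustration only:
class EXAMPLE_READER(SWEEX):
    name        = 'Example rebadged reader'
    gui_name    = 'Example'
    description = _('Communicate with the Example reader')
    BCD         = [0x326]   # revision distinguishing it from the Q600 (0x325)
```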
diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py
index a8ff0f1ad0..81d996c6a7 100644
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@@ -9,11 +9,13 @@ Fetch metadata using Amazon AWS
import sys, re
from lxml import html
+from lxml.html import soupparser
from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
+from calibre.library.comments import sanitize_comments_html
def find_asin(br, isbn):
q = 'http://www.amazon.com/s?field-keywords='+isbn
@@ -70,7 +72,10 @@ def get_metadata(br, asin, mi):
return False
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
- root = html.fromstring(raw)
+ try:
+ root = soupparser.fromstring(raw)
+ except:
+ return False
ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
if ratings:
pat = re.compile(r'([0-9.]+) out of (\d+) stars')
@@ -95,25 +100,26 @@ def get_metadata(br, asin, mi):
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace
- desc = re.sub('\n+', '\n', desc)
- desc = re.sub(' +', ' ', desc)
+ #desc = re.sub('\n+', '\n', desc)
+ #desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
- mi.comments = desc
+ mi.comments = sanitize_comments_html(desc)
return True
def main(args=sys.argv):
# Test xisbn
- print get_social_metadata('Learning Python', None, None, '8324616489')
- print
+ #print get_social_metadata('Learning Python', None, None, '8324616489')
+ #print
# Test sophisticated comment formatting
- print get_social_metadata('Swan Thieves', None, None, '9780316065795')
+ print get_social_metadata('Angels & Demons', None, None, '9781416580829')
print
+ return
# Random tests
print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
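On the regex side, the `(?s)` inline flag makes `.` match newlines, which is what lets the comment stripper handle multi-line HTML comments:

```python
import re
raw = 'keep <!-- a\nmulti-line\ncomment -->this'
print re.sub(r'(?s)<!--.*?-->', '', raw)   # keep this
```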
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 78585d13b6..4e93335af6 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -484,17 +484,22 @@ class DeviceMenu(QMenu): # {{{
_('Storage Card B')),
]
+ later_menus = []
for menu in (self, self.set_default_menu):
for actions, desc in (
(basic_actions, ''),
+ (specific_actions, _('Send specific format to')),
(delete_actions, _('Send and delete from library')),
- (specific_actions, _('Send specific format to'))
):
mdest = menu
if actions is not basic_actions:
- mdest = menu.addMenu(desc)
+ mdest = QMenu(desc)
self._memory.append(mdest)
+ later_menus.append(mdest)
+ if menu is self.set_default_menu:
+ menu.addMenu(mdest)
+ menu.addSeparator()
for dest, delete, specific, icon, text in actions:
action = DeviceAction(dest, delete, specific, icon, text, self)
@@ -507,7 +512,7 @@ class DeviceMenu(QMenu): # {{{
action.a_s.connect(self.action_triggered)
self.actions.append(action)
mdest.addAction(action)
- if actions is not specific_actions:
+ if actions is basic_actions:
menu.addSeparator()
da = config['default_send_to_device_action']
@@ -525,14 +530,21 @@ class DeviceMenu(QMenu): # {{{
self.group.triggered.connect(self.change_default_action)
self.addSeparator()
+ self.addMenu(later_menus[0])
+ self.addSeparator()
+
mitem = self.addAction(QIcon(I('eject.png')), _('Eject device'))
mitem.setEnabled(False)
mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit())
self.disconnect_mounted_device_action = mitem
-
self.addSeparator()
+
self.addMenu(self.set_default_menu)
self.addSeparator()
+
+ self.addMenu(later_menus[1])
+ self.addSeparator()
+
annot = self.addAction(_('Fetch annotations (experimental)'))
annot.setEnabled(False)
annot.triggered.connect(lambda x :
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 670d9f2564..83eec89abe 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -11,11 +11,15 @@ from calibre.constants import preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
CData, Comment, Declaration, ProcessingInstruction
from calibre import prepare_string_for_xml
+from calibre.utils.html2text import html2text
+from calibre.ebooks.markdown import markdown
# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
+sanitize_pat = re.compile(r'<\s*(comment|script|style|title|head|form)', re.DOTALL|re.IGNORECASE)
+
+
# Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
'.\r'), comments)
@@ -115,6 +128,11 @@ def comments_to_html(comments):
return result.renderContents(encoding=None)
+def sanitize_comments_html(html):
+ text = html2text(html)
+ md = markdown.Markdown(safe_mode=True)
+ return md.convert(text)
+
def test():
for pat, val in [
('lineone\n\nlinetwo',
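`sanitize_comments_html` is a round-trip: html2text flattens untrusted HTML to Markdown-ish text, discarding attributes (including event handlers) along the way, and markdown's `safe_mode` re-renders the result with any leftover raw HTML neutralised instead of emitted verbatim. Indicative behaviour (exact markup may differ):

```python
html = u'<p onmouseover="steal()">A <b>useful</b> review</p>'
print sanitize_comments_html(html)
# -> <p>A <strong>useful</strong> review</p>   (handler attribute is gone)
```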
diff --git a/src/calibre/library/server/base.py b/src/calibre/library/server/base.py
index c9025a28f8..29636c5659 100644
--- a/src/calibre/library/server/base.py
+++ b/src/calibre/library/server/base.py
@@ -118,16 +118,17 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
self.set_database(db)
cherrypy.config.update({
- 'log.screen' : opts.develop,
- 'engine.autoreload_on' : opts.develop,
- 'tools.log_headers.on' : opts.develop,
- 'checker.on' : opts.develop,
- 'request.show_tracebacks': show_tracebacks,
- 'server.socket_host' : listen_on,
- 'server.socket_port' : opts.port,
- 'server.socket_timeout' : opts.timeout, #seconds
- 'server.thread_pool' : opts.thread_pool, # number of threads
- })
+ 'log.screen' : opts.develop,
+ 'engine.autoreload_on' : getattr(opts,
+ 'auto_reload', False),
+ 'tools.log_headers.on' : opts.develop,
+ 'checker.on' : opts.develop,
+ 'request.show_tracebacks': show_tracebacks,
+ 'server.socket_host' : listen_on,
+ 'server.socket_port' : opts.port,
+ 'server.socket_timeout' : opts.timeout, #seconds
+ 'server.thread_pool' : opts.thread_pool, # number of threads
+ })
if embedded or wsgi:
cherrypy.config.update({'engine.SIGHUP' : None,
'engine.SIGTERM' : None,})
diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 9c442acc11..9530a34c73 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -123,9 +123,10 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
def item(i):
templ = (u'<div title="{4}" class="category-item">'
- '<div class="category-name">{0}</div><div>{1}</div>'
- '<div>{2}</div>'
- '<span class="href">{5}{3}</span></div>')
+ '<div class="category-name">'
+ '<a href="{5}{3}" title="{4}">{0}</a></div>'
+ '<div>{1}</div>'
+ '<div>{2}</div></div>')
rating, rstring = render_rating(i.avg_rating, prefix)
name = xml(i.name)
if datatype == 'rating':
@@ -142,7 +143,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
q = category
href = '/browse/matches/%s/%s'%(quote(q), quote(id_))
return templ.format(xml(name), rating,
- xml(desc), xml(href), rstring, prefix)
+ xml(desc), xml(href, True), rstring, prefix)
items = list(map(item, items))
return '\n'.join(['<div class="category-container">'] + items + ['</div>'])
@@ -252,8 +253,6 @@ class BrowseServer(object):
lp = self.db.library_path
if isbytestring(lp):
lp = force_unicode(lp, filesystem_encoding)
- if isinstance(ans, unicode):
- ans = ans.encode('utf-8')
ans = ans.replace('{library_name}', xml(os.path.basename(lp)))
ans = ans.replace('{library_path}', xml(lp, True))
ans = ans.replace('{initial_search}', initial_search)
@@ -335,9 +334,10 @@ class BrowseServer(object):
icon = 'blank.png'
cats.append((meta['name'], category, icon))
- cats = [('<li title="{2} {0}"><img src="{src}" alt="{0}" />'
+ cats = [('<li><a title="{2} {0}" href="{3}/browse/category/{1}">&nbsp;</a>'
+ '<img src="{src}" alt="{0}" />'
'<span class="label">{0}</span>'
- '<span class="url">{3}/browse/category/{1}</span></li>')
+ '</li>')
.format(xml(x, True), xml(quote(y)), xml(_('Browse books by')),
self.opts.url_prefix, src='/browse/icon/'+z)
for x, y, z in cats]
@@ -393,14 +393,15 @@ class BrowseServer(object):
for x in sorted(starts):
category_groups[x] = len([y for y in items if
getter(y).upper().startswith(x)])
- items = [(u'<h3 title="{0}">{0} [{2}]</h3><div>'
+ items = [(u'<h3 title="{0}"><a class="load_href" href="{4}{3}">'
+ u'{0} [{2}]</a></h3><div>'
u'<div class="loaded" style="display:none"></div>'
u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" />{1}</div>'
- u'<span class="load_href">{4}{3}</span></div>').format(
+ u'</div>').format(
xml(s, True),
xml(_('Loading, please wait'))+'…',
unicode(c),
- xml(u'/browse/category_group/%s/%s'%(category, s)),
+ xml(u'/browse/category_group/%s/%s'%(category, s), True),
self.opts.url_prefix)
for s, c in category_groups.items()]
items = '\n\n'.join(items)
@@ -563,7 +564,8 @@ class BrowseServer(object):
if not val:
val = ''
args[key] = xml(val, True)
- fname = ascii_filename(args['title']) + ' - ' + ascii_filename(args['authors'])
+ fname = quote(ascii_filename(args['title']) + ' - ' +
+ ascii_filename(args['authors']))
return args, fmt, fmts, fname
@Endpoint(mimetype='application/json; charset=utf-8')
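The new `quote()` wrapping here (and in mobile.py below) matters because `ascii_filename()` still allows characters such as spaces and `#` that are not safe inside a URL path:

```python
from urllib import quote
print quote('Book Title - A. Author #1')
# Book%20Title%20-%20A.%20Author%20%231
```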
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 670c31b9df..6437f02cb6 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -70,10 +70,10 @@ class ContentServer(object):
id = id.rpartition('_')[-1].partition('.')[0]
match = re.search(r'\d+', id)
if not match:
- raise cherrypy.HTTPError(400, 'id:%s not an integer'%id)
+ raise cherrypy.HTTPError(404, 'id:%s not an integer'%id)
id = int(match.group())
if not self.db.has_id(id):
- raise cherrypy.HTTPError(400, 'id:%d does not exist in database'%id)
+ raise cherrypy.HTTPError(404, 'id:%d does not exist in database'%id)
if what == 'thumb' or what.startswith('thumb_'):
try:
width, height = map(int, what.split('_')[1:])
diff --git a/src/calibre/library/server/main.py b/src/calibre/library/server/main.py
index fbd811a1ab..b7cb3ecf12 100644
--- a/src/calibre/library/server/main.py
+++ b/src/calibre/library/server/main.py
@@ -58,6 +58,9 @@ The OPDS interface is advertised via BonJour automatically.
help=_('Specifies a restriction to be used for this invocation. '
'This option overrides any per-library settings specified'
' in the GUI'))
+ parser.add_option('--auto-reload', default=False, action='store_true',
+ help=_('Auto reload server when source code changes. May not'
+ ' work in all environments.'))
return parser
diff --git a/src/calibre/library/server/mobile.py b/src/calibre/library/server/mobile.py
index a889089109..d66e6d842f 100644
--- a/src/calibre/library/server/mobile.py
+++ b/src/calibre/library/server/mobile.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import re, os
import __builtin__
+from urllib import quote
import cherrypy
from lxml import html
@@ -115,8 +116,8 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
data = TD()
for fmt in book['formats'].split(','):
- a = ascii_filename(book['authors'])
- t = ascii_filename(book['title'])
+ a = quote(ascii_filename(book['authors']))
+ t = quote(ascii_filename(book['title']))
s = SPAN(
A(
fmt.lower(),
diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py
new file mode 100644
index 0000000000..0eb84a3d38
--- /dev/null
+++ b/src/calibre/utils/html2text.py
@@ -0,0 +1,452 @@
+#!/usr/bin/env python
+"""html2text: Turn HTML into equivalent Markdown-structured text."""
+__version__ = "2.39"
+__author__ = "Aaron Swartz (me@aaronsw.com)"
+__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
+
+# TODO:
+# Support decoded entities with unifiable.
+
+if not hasattr(__builtins__, 'True'): True, False = 1, 0
+import re, sys, urllib, htmlentitydefs, codecs
+import sgmllib
+import urlparse
+sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
+
+try: from textwrap import wrap
+except: pass
+
+# Use Unicode characters instead of their ascii pseudo-replacements
+UNICODE_SNOB = 1
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = 0
+
+# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
+BODY_WIDTH = 0
+
+# Don't show internal links (href="#local-anchor") -- corresponding link targets
+# won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = True
+
+### Entity Nonsense ###
+
+def name2cp(k):
+ if k == 'apos': return ord("'")
+ if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
+ return htmlentitydefs.name2codepoint[k]
+ else:
+ k = htmlentitydefs.entitydefs[k]
+ if k.startswith("") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
+ return ord(codecs.latin_1_decode(k)[0])
+
+unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
+'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
+'ndash':'-', 'oelig':'oe', 'aelig':'ae',
+'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
+'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
+'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
+'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
+'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
+
+unifiable_n = {}
+
+for k in unifiable.keys():
+ unifiable_n[name2cp(k)] = unifiable[k]
+
+def charref(name):
+ if name[0] in ['x','X']:
+ c = int(name[1:], 16)
+ else:
+ c = int(name)
+
+ if not UNICODE_SNOB and c in unifiable_n.keys():
+ return unifiable_n[c]
+ else:
+ return unichr(c)
+
+def entityref(c):
+ if not UNICODE_SNOB and c in unifiable.keys():
+ return unifiable[c]
+ else:
+ try: name2cp(c)
+ except KeyError: return "&" + c
+ else: return unichr(name2cp(c))
+
+def replaceEntities(s):
+ s = s.group(1)
+ if s[0] == "#":
+ return charref(s[1:])
+ else: return entityref(s)
+
+r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+def unescape(s):
+ return r_unescape.sub(replaceEntities, s)
+
+def fixattrs(attrs):
+ # Fix bug in sgmllib.py
+ if not attrs: return attrs
+ newattrs = []
+ for attr in attrs:
+ newattrs.append((attr[0], unescape(attr[1])))
+ return newattrs
+
+### End Entity Nonsense ###
+
+def onlywhite(line):
+ """Return true if the line does only consist of whitespace characters."""
+ for c in line:
+ if c is not ' ' and c is not ' ':
+ return c is ' '
+ return line
+
+def optwrap(text):
+ """Wrap all paragraphs in the provided text."""
+ if not BODY_WIDTH:
+ return text
+
+ assert wrap, "Requires Python 2.3."
+ result = ''
+ newlines = 0
+ for para in text.split("\n"):
+ if len(para) > 0:
+ if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
+ for line in wrap(para, BODY_WIDTH):
+ result += line + "\n"
+ result += "\n"
+ newlines = 2
+ else:
+ if not onlywhite(para):
+ result += para + "\n"
+ newlines = 1
+ else:
+ if newlines < 2:
+ result += "\n"
+ newlines += 1
+ return result
+
+def hn(tag):
+ if tag[0] == 'h' and len(tag) == 2:
+ try:
+ n = int(tag[1])
+ if n in range(1, 10): return n
+ except ValueError: return 0
+
+class _html2text(sgmllib.SGMLParser):
+ def __init__(self, out=None, baseurl=''):
+ sgmllib.SGMLParser.__init__(self)
+
+ if out is None: self.out = self.outtextf
+ else: self.out = out
+ self.outtext = u''
+ self.quiet = 0
+ self.p_p = 0
+ self.outcount = 0
+ self.start = 1
+ self.space = 0
+ self.a = []
+ self.astack = []
+ self.acount = 0
+ self.list = []
+ self.blockquote = 0
+ self.pre = 0
+ self.startpre = 0
+ self.lastWasNL = 0
+ self.abbr_title = None # current abbreviation definition
+ self.abbr_data = None # last inner HTML (for abbr being defined)
+ self.abbr_list = {} # stack of abbreviations to write later
+ self.baseurl = baseurl
+
+ def outtextf(self, s):
+ self.outtext += s
+
+ def close(self):
+ sgmllib.SGMLParser.close(self)
+
+ self.pbr()
+ self.o('', 0, 'end')
+
+ return self.outtext
+
+ def handle_charref(self, c):
+ self.o(charref(c))
+
+ def handle_entityref(self, c):
+ self.o(entityref(c))
+
+ def unknown_starttag(self, tag, attrs):
+ self.handle_tag(tag, attrs, 1)
+
+ def unknown_endtag(self, tag):
+ self.handle_tag(tag, None, 0)
+
+ def previousIndex(self, attrs):
+ """ returns the index of certain set of attributes (of a link) in the
+ self.a list
+
+ If the set of attributes is not found, returns None
+ """
+ if not attrs.has_key('href'): return None
+
+ i = -1
+ for a in self.a:
+ i += 1
+ match = 0
+
+ if a.has_key('href') and a['href'] == attrs['href']:
+ if a.has_key('title') or attrs.has_key('title'):
+ if (a.has_key('title') and attrs.has_key('title') and
+ a['title'] == attrs['title']):
+ match = True
+ else:
+ match = True
+
+ if match: return i
+
+ def handle_tag(self, tag, attrs, start):
+ attrs = fixattrs(attrs)
+
+ if hn(tag):
+ self.p()
+ if start: self.o(hn(tag)*"#" + ' ')
+
+ if tag in ['p', 'div']: self.p()
+
+ if tag == "br" and start: self.o(" \n")
+
+ if tag == "hr" and start:
+ self.p()
+ self.o("* * *")
+ self.p()
+
+ if tag in ["head", "style", 'script']:
+ if start: self.quiet += 1
+ else: self.quiet -= 1
+
+ if tag in ["body"]:
+ self.quiet = 0 # sites like 9rules.com never close
+
+ if tag == "blockquote":
+ if start:
+ self.p(); self.o('> ', 0, 1); self.start = 1
+ self.blockquote += 1
+ else:
+ self.blockquote -= 1
+ self.p()
+
+ if tag in ['em', 'i', 'u']: self.o("_")
+ if tag in ['strong', 'b']: self.o("**")
+ if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
+ if tag == "abbr":
+ if start:
+ attrsD = {}
+ for (x, y) in attrs: attrsD[x] = y
+ attrs = attrsD
+
+ self.abbr_title = None
+ self.abbr_data = ''
+ if attrs.has_key('title'):
+ self.abbr_title = attrs['title']
+ else:
+ if self.abbr_title != None:
+ self.abbr_list[self.abbr_data] = self.abbr_title
+ self.abbr_title = None
+ self.abbr_data = ''
+
+ if tag == "a":
+ if start:
+ attrsD = {}
+ for (x, y) in attrs: attrsD[x] = y
+ attrs = attrsD
+ if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
+ self.astack.append(attrs)
+ self.o("[")
+ else:
+ self.astack.append(None)
+ else:
+ if self.astack:
+ a = self.astack.pop()
+ if a:
+ i = self.previousIndex(a)
+ if i is not None:
+ a = self.a[i]
+ else:
+ self.acount += 1
+ a['count'] = self.acount
+ a['outcount'] = self.outcount
+ self.a.append(a)
+ self.o("][" + `a['count']` + "]")
+
+ if tag == "img" and start:
+ attrsD = {}
+ for (x, y) in attrs: attrsD[x] = y
+ attrs = attrsD
+ if attrs.has_key('src'):
+ attrs['href'] = attrs['src']
+ alt = attrs.get('alt', '')
+ i = self.previousIndex(attrs)
+ if i is not None:
+ attrs = self.a[i]
+ else:
+ self.acount += 1
+ attrs['count'] = self.acount
+ attrs['outcount'] = self.outcount
+ self.a.append(attrs)
+ self.o("![")
+ self.o(alt)
+ self.o("]["+`attrs['count']`+"]")
+
+ if tag == 'dl' and start: self.p()
+ if tag == 'dt' and not start: self.pbr()
+ if tag == 'dd' and start: self.o('    ')
+ if tag == 'dd' and not start: self.pbr()
+
+ if tag in ["ol", "ul"]:
+ if start:
+ self.list.append({'name':tag, 'num':0})
+ else:
+ if self.list: self.list.pop()
+
+ self.p()
+
+ if tag == 'li':
+ if start:
+ self.pbr()
+ if self.list: li = self.list[-1]
+ else: li = {'name':'ul', 'num':0}
+ self.o(" "*len(self.list)) #TODO: line up - s > 9 correctly.
+ if li['name'] == "ul": self.o("* ")
+ elif li['name'] == "ol":
+ li['num'] += 1
+ self.o(`li['num']`+". ")
+ self.start = 1
+ else:
+ self.pbr()
+
+ if tag in ["table", "tr"] and start: self.p()
+ if tag == 'td': self.pbr()
+
+ if tag == "pre":
+ if start:
+ self.startpre = 1
+ self.pre = 1
+ else:
+ self.pre = 0
+ self.p()
+
+ def pbr(self):
+ if self.p_p == 0: self.p_p = 1
+
+ def p(self): self.p_p = 2
+
+ def o(self, data, puredata=0, force=0):
+ if self.abbr_data is not None: self.abbr_data += data
+
+ if not self.quiet:
+ if puredata and not self.pre:
+ data = re.sub('\s+', ' ', data)
+ if data and data[0] == ' ':
+ self.space = 1
+ data = data[1:]
+ if not data and not force: return
+
+ if self.startpre:
+ #self.out(" :") #TODO: not output when already one there
+ self.startpre = 0
+
+ bq = (">" * self.blockquote)
+ if not (force and data and data[0] == ">") and self.blockquote: bq += " "
+
+ if self.pre:
+ bq += " "
+ data = data.replace("\n", "\n"+bq)
+
+ if self.start:
+ self.space = 0
+ self.p_p = 0
+ self.start = 0
+
+ if force == 'end':
+ # It's the end.
+ self.p_p = 0
+ self.out("\n")
+ self.space = 0
+
+
+ if self.p_p:
+ self.out(('\n'+bq)*self.p_p)
+ self.space = 0
+
+ if self.space:
+ if not self.lastWasNL: self.out(' ')
+ self.space = 0
+
+ if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
+ if force == "end": self.out("\n")
+
+ newa = []
+ for link in self.a:
+ if self.outcount > link['outcount']:
+ self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
+ if link.has_key('title'): self.out(" ("+link['title']+")")
+ self.out("\n")
+ else:
+ newa.append(link)
+
+ if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
+
+ self.a = newa
+
+ if self.abbr_list and force == "end":
+ for abbr, definition in self.abbr_list.items():
+ self.out(" *[" + abbr + "]: " + definition + "\n")
+
+ self.p_p = 0
+ self.out(data)
+ self.lastWasNL = data and data[-1] == '\n'
+ self.outcount += 1
+
+ def handle_data(self, data):
+ if r'\/script>' in data: self.quiet -= 1
+ self.o(data, 1)
+
+ def unknown_decl(self, data): pass
+
+def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
+
+def html2text_file(html, out=wrapwrite, baseurl=''):
+ h = _html2text(out, baseurl)
+ h.feed(html)
+ h.feed("")
+ return h.close()
+
+def html2text(html, baseurl=''):
+ return optwrap(html2text_file(html, None, baseurl))
+
+if __name__ == "__main__":
+ baseurl = ''
+ if sys.argv[1:]:
+ arg = sys.argv[1]
+ if arg.startswith('http://') or arg.startswith('https://'):
+ baseurl = arg
+ j = urllib.urlopen(baseurl)
+ try:
+ from feedparser import _getCharacterEncoding as enc
+ enc
+ except ImportError:
+ enc = lambda x, y: ('utf-8', 1)
+ text = j.read()
+ encoding = enc(j.headers, text)[0]
+ if encoding == 'us-ascii': encoding = 'utf-8'
+ data = text.decode(encoding)
+
+ else:
+ encoding = 'utf8'
+ if len(sys.argv) > 2:
+ encoding = sys.argv[2]
+ data = open(arg, 'r').read().decode(encoding)
+ else:
+ data = sys.stdin.read().decode('utf8')
+ wrapwrite(html2text(data, baseurl))
+
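A quick smoke test of the vendored converter; with `BODY_WIDTH = 0` no re-wrapping is applied (output shown approximately):

```python
from calibre.utils.html2text import html2text

print html2text(u'<h1>Title</h1><p>Some <b>bold</b> text.</p>')
# # Title
#
# Some **bold** text.
```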