Merge from trunk

This commit is contained in:
Sengian 2010-11-01 01:31:39 +01:00
commit 679136f6a7
21 changed files with 429 additions and 142 deletions

View File

@ -208,6 +208,8 @@ h2.library_name {
} }
.toplevel li a { text-decoration: none; }
.toplevel li img { .toplevel li img {
vertical-align: middle; vertical-align: middle;
margin-right: 1em; margin-right: 1em;
@ -261,9 +263,16 @@ h2.library_name {
} }
.category div.category-item span.href { display: none } .category div.category-item a { text-decoration: none; color: inherit; }
#groups span.load_href { display: none } #groups a.load_href {
text-decoration: none;
color: inherit;
font-size: medium;
font-weight: normal;
padding: 0;
padding-left: 0.5em;
}
#groups h3 { #groups h3 {
font-weight: bold; font-weight: bold;

View File

@ -116,7 +116,7 @@ function toplevel() {
$(".sort_select").hide(); $(".sort_select").hide();
$(".toplevel li").click(function() { $(".toplevel li").click(function() {
var href = $(this).children("span.url").text(); var href = $(this).children("a").attr('href');
window.location = href; window.location = href;
}); });
@ -133,7 +133,7 @@ function render_error(msg) {
// Category feed {{{ // Category feed {{{
function category_clicked() { function category_clicked() {
var href = $(this).find("span.href").html(); var href = $(this).find("a").attr('href');
window.location = href; window.location = href;
} }
@ -151,7 +151,7 @@ function category() {
change: function(event, ui) { change: function(event, ui) {
if (ui.newContent) { if (ui.newContent) {
var href = ui.newContent.children("span.load_href").html(); var href = ui.newContent.prev().children("a.load_href").attr('href');
ui.newContent.children(".loading").show(); ui.newContent.children(".loading").show();
if (href) { if (href) {
$.ajax({ $.ajax({

View File

@ -0,0 +1,50 @@
from calibre.web.feeds.news import BasicNewsRecipe
class ClicRBS(BasicNewsRecipe):
    '''
    Fetch articles from the Brazilian clicRBS newspaper network
    (zerohora.com, Diario Catarinense, Pioneiro, etc.) via their RSS feeds.
    '''
    title = u'ClicRBS'
    language = 'pt'
    __author__ = 'arvoredo'
    oldest_article = 3
    max_articles_per_feed = 9
    cover_url = 'http://www.publicidade.clicrbs.com.br/clicrbs/imgs/logo_clic.gif'

    remove_tags = [
        dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']})
    ]

    # NOTE(review): the original recipe re-assigned remove_tags_before and
    # remove_tags_after several times in a row; in Python only the LAST
    # assignment of a class attribute takes effect, so the earlier ones
    # (including one matching the bogus tag name 'div ' with a trailing
    # space) were dead code. Only the effective assignments are kept here —
    # runtime behavior is unchanged.
    remove_tags_before = dict(name='div', attrs={'class':'coluna'})
    remove_tags_after = dict(name='ul', attrs={'class':'lista'})

    feeds = [
        (u'zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=1&local=1&channel=13')
        , (u'diariocatarinense.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=2&local=18&channel=67')
        , (u'Concursos e Emprego', u'http://g1.globo.com/Rss2/0,,AS0-9654,00.xml')
        , (u'Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?channel=87&uf=1&local=1')
        , (u'Economia, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=801&uf=1&local=1&channel=13')
        , (u'Esportes, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=802&uf=1&local=1&channel=13')
        , (u'Economia, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1180&channel=87&uf=1&local=1')
        , (u'Política, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1185&channel=87&uf=1&local=1')
        , (u'Mundo, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1184&channel=87&uf=1&local=1')
        , (u'Catarinense, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=&theme=371&uf=2&channel=2')
        , (u'Geral, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1183&channel=87&uf=1&local=1')
        , (u'Estilo de Vida, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=805&uf=1&local=1&channel=13')
        , (u'Corrida, Corrida, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1313&theme=15704&uf=1&channel=2')
        , (u'Jornal de Santa Catarina, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?espid=159&uf=2&local=18')
        , (u'Grêmio, Futebol, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=11&theme=65&uf=1&channel=2')
        , (u'Velocidade, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1314&theme=2655&uf=1&channel=2')
    ]

    extra_css = '''
    cite{color:#007BB5; font-size:xx-small; font-style:italic;}
    body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
    h3{font-size:large; color:#082963; font-weight:bold;}
    #ident{color:#0179B4; font-size:xx-small;}
    p{color:#000000;font-weight:normal;}
    .commentario p{color:#007BB5; font-style:italic;}
    '''

View File

@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CMJornal_pt(BasicNewsRecipe):
    '''
    Recipe for the Portuguese daily "Correio da Manha": downloads the
    paper's RSS feeds and rewrites article URLs to the print view.
    '''
    title = 'Correio da Manha - Portugal'
    __author__ = 'jmst'
    description = 'As noticias de Portugal e do Mundo'
    publisher = 'Cofina Media'
    category = ''
    language = 'pt'

    # Fetch/conversion behaviour.
    oldest_article = 1
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} '

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Keep only the headline tags and the article body container.
    keep_only_tags = [
        dict(name=['h2', 'h1']),
        dict(name='div', attrs={'class': ['news']}),
    ]

    # Strip embedded media and empty anchor links.
    remove_tags = [
        dict(name=['object', 'embed', 'iframe']),
        dict(name='a', attrs={'href': ['#']}),
    ]

    feeds = [
        (u'Actualidade', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000009-0000-0000-0000-000000000009'),
        (u'Portugal', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000010-0000-0000-0000-000000000010'),
        (u'Economia', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000011-0000-0000-0000-000000000011'),
        (u'Mundo', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000091-0000-0000-0000-000000000091'),
        (u'Desporto', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000012-0000-0000-0000-000000000012'),
        (u'TV & Media', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000092-0000-0000-0000-000000000092'),
    ]

    def print_version(self, url):
        # The print view lives at Imprimir.aspx next to noticia.aspx.
        return url.replace('noticia.aspx', 'Imprimir.aspx')

View File

@ -1,53 +1,79 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>' __copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, Louis Gesbert <meta at antislash dot info>'
''' '''
Mediapart Mediapart
''' '''
import re, string from calibre.ebooks.BeautifulSoup import Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Mediapart(BasicNewsRecipe): class Mediapart(BasicNewsRecipe):
title = 'Mediapart' title = 'Mediapart'
__author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>' __author__ = 'Mathieu Godlewski'
description = 'Global news in french from online newspapers' description = 'Global news in french from online newspapers'
oldest_article = 7 oldest_article = 7
language = 'fr' language = 'fr'
needs_subscription = True
max_articles_per_feed = 50 max_articles_per_feed = 50
no_stylesheets = True no_stylesheets = True
html2lrf_options = ['--base-font-size', '10'] cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'
feeds = [ feeds = [
('Les articles', 'http://www.mediapart.fr/articles/feed'), ('Les articles', 'http://www.mediapart.fr/articles/feed'),
] ]
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in # -- print-version has poor quality on this website, better do the conversion ourselves
[ #
(r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'), # preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
(r'<p>Mediapart\.fr</p>', lambda match : ''), # [
(r'<p[^>]*>[\s]*</p>', lambda match : ''), # (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
(r'<p><a href="[^\.]+\.pdf">[^>]*</a></p>', lambda match : ''), # (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
# lambda match : '<i>'+match.group(1)+'</i>'),
# (r'\'', lambda match: '&rsquo;'),
# ]
# ]
#
# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
# dict(name='div', attrs={'class':'print-links'}),
# dict(name='img', attrs={'src':'entete_article.png'}),
# dict(name='br') ]
#
# def print_version(self, url):
# raw = self.browser.open(url).read()
# soup = BeautifulSoup(raw.decode('utf8', 'replace'))
# div = soup.find('div', {'id':re.compile('node-\d+')})
# if div is None:
# return None
# article_id = string.replace(div['id'], 'node-', '')
# if article_id is None:
# return None
# return 'http://www.mediapart.fr/print/'+article_id
# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
keep_only_tags = [
dict(name='h1', attrs={'class':'title'}),
dict(name='div', attrs={'class':'page_papier_detail'}),
] ]
]
remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), def preprocess_html(self,soup):
dict(name='div', attrs={'class':'print-links'}), for title in soup.findAll('div', {'class':'titre'}):
dict(name='img', attrs={'src':'entete_article.png'}), tag = Tag(soup, 'h3')
] title.replaceWith(tag)
tag.insert(0,title)
return soup
# -- Handle login
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.mediapart.fr/')
br.select_form(nr=1)
br['name'] = self.username
br['pass'] = self.password
br.submit()
return br
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
div = soup.find('div', {'class':'node node-type-article'})
if div is None:
return None
article_id = string.replace(div['id'], 'node-', '')
if article_id is None:
return None
return 'http://www.mediapart.fr/print/'+article_id

View File

@ -0,0 +1,53 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Louis Gesbert <meta at antislash dot info>'
'''
Rue89
'''
__author__ = '2010, Louis Gesbert <meta at antislash dot info>'
import re
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
class Rue89(BasicNewsRecipe):
    '''
    Recipe for Rue89, a French news website; fetches the front-page feed
    and reshapes each article page down to its title and content.
    '''
    # NOTE(review): the original assigned ``title`` twice ('Rue89' and then
    # u'Rue89'); the first assignment was dead code and has been removed.
    title = u'Rue89'
    __author__ = 'Louis Gesbert'
    description = 'Popular free french news website'
    language = 'fr'
    oldest_article = 7
    max_articles_per_feed = 50
    feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')]
    no_stylesheets = True

    preprocess_regexps = [
        # Demote existing h2 headings to h3 so the article title (promoted
        # below) stays the top-level heading.
        (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL),
            lambda match : '<'+match.group(1)+'h3>'),
        (re.compile(r'<div class="print-title">([^>]+)</div>', re.IGNORECASE|re.DOTALL),
            lambda match : '<h2>'+match.group(1)+'</h2>'),
        # Replace the "issue number" GIFs with a styled text span.
        (re.compile(r'<img[^>]+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL),
            lambda match : '<span style="font-family: Sans-serif; color: red; font-size:24pt; padding=2pt;">'+match.group(1)+'</span>'),
        (re.compile(r'\''), lambda match: '&rsquo;'),
    ]

    def preprocess_html(self, soup):
        # Rebuild <body> so it contains only the title and article content.
        body = Tag(soup, 'body')
        title = soup.find('h1', {'class':'title'})
        content = soup.find('div', {'class':'content'})
        soup.body.replaceWith(body)
        body.insert(0, title)
        body.insert(1, content)
        return soup

    remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}),
        #dict(name='div', attrs={'class':'print-links'}),
        #dict(name='img', attrs={'class':'print-logo'}),
        dict(name='div', attrs={'class':'content_top'}),
        dict(name='div', attrs={'id':'sidebar-left'}), ]

    # -- print-version has poor quality on this website, better do the conversion ourselves
    # def print_version(self, url):
    #     return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url)

View File

@ -28,7 +28,9 @@ If there are no windows binaries already compiled for the version of python you
Run the following command to install python dependencies:: Run the following command to install python dependencies::
easy_install --always-unzip -U ipython mechanize BeautifulSoup pyreadline python-dateutil dnspython easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython
Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)
Qt Qt
-------- --------

View File

@ -5,10 +5,38 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import subprocess, tempfile, os, time import subprocess, tempfile, os, time, sys
from threading import RLock
from setup import Command from setup import Command
try:
from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent
except:
wm = None
else:
wm = WatchManager()
flags = EventsCodes.ALL_FLAGS
mask = flags['IN_MODIFY']
class ProcessEvents(ProcessEvent):
def __init__(self, command):
ProcessEvent.__init__(self)
self.command = command
def process_default(self, event):
name = getattr(event,
'name', None)
if name and os.path.splitext(name)[1] == '.py':
print
print name, 'changed'
self.command.kill_server()
self.command.launch_server()
print self.command.prompt,
sys.stdout.flush()
class Server(Command): class Server(Command):
description = 'Run the calibre server in development mode conveniently' description = 'Run the calibre server in development mode conveniently'
@ -18,31 +46,62 @@ class Server(Command):
def rebuild_monocole(self): def rebuild_monocole(self):
subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH, subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH,
'-I', 'src', 'src/monocle.js'], '-I', 'src', 'src/monocle.js'],
stdout=open('resources/content_server/monocle.js', 'wb')) stdout=open('resources/content_server/read/monocle.js', 'wb'))
def launch_server(self, log): def launch_server(self):
self.rebuild_monocole() print 'Starting server...\n'
p = subprocess.Popen(['calibre-server', '--develop'], with self.lock:
stderr=subprocess.STDOUT, stdout=log) self.rebuild_monocole()
return p self.server_proc = p = subprocess.Popen(['calibre-server', '--develop'],
stderr=subprocess.STDOUT, stdout=self.server_log)
time.sleep(0.2)
if p.poll() is not None:
print 'Starting server failed'
raise SystemExit(1)
return p
def kill_server(self):
print 'Killing server...\n'
if self.server_proc is not None:
with self.lock:
if self.server_proc.poll() is None:
self.server_proc.terminate()
while self.server_proc.poll() is None:
time.sleep(0.1)
def watch(self):
if wm is not None:
self.notifier = ThreadedNotifier(wm, ProcessEvents(self))
self.notifier.start()
self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True)
def run(self, opts): def run(self, opts):
self.lock = RLock()
tdir = tempfile.gettempdir() tdir = tempfile.gettempdir()
logf = os.path.join(tdir, 'calibre-server.log') logf = os.path.join(tdir, 'calibre-server.log')
log = open(logf, 'ab') self.server_log = open(logf, 'ab')
self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: '
print 'Server log available at:', logf print 'Server log available at:', logf
print
self.watch()
first = True
while True: while True:
print 'Starting server...' self.launch_server()
p = self.launch_server(log) if first:
pass
first = False
try: try:
raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ') raw_input(self.prompt)
except: except:
print
self.kill_server()
break break
else: else:
while p.returncode is None: self.kill_server()
p.terminate()
time.sleep(0.1)
p.kill()
print print
if hasattr(self, 'notifier'):
self.notifier.stop()

View File

@ -475,7 +475,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
SOVOS, PICO SOVOS, PICO
from calibre.devices.sne.driver import SNE from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO from calibre.devices.kobo.driver import KOBO
@ -586,6 +586,7 @@ plugins += [
AVANT, AVANT,
MENTOR, MENTOR,
SWEEX, SWEEX,
Q600,
KOGAN, KOGAN,
PDNOVEL, PDNOVEL,
SPECTRA, SPECTRA,

View File

@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.epub import set_metadata from calibre.ebooks.metadata.epub import set_metadata
from calibre.library.server.utils import strftime from calibre.library.server.utils import strftime
from calibre.utils.config import config_dir, prefs from calibre.utils.config import config_dir, prefs
from calibre.utils.date import isoformat, now, parse_date from calibre.utils.date import now, parse_date
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -2521,11 +2521,11 @@ class ITUNES(DriverBase):
metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour, metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour,
old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo) old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo)
else: else:
metadata.timestamp = isoformat(now()) metadata.timestamp = now()
if DEBUG: if DEBUG:
self.log.info(" add timestamp: %s" % metadata.timestamp) self.log.info(" add timestamp: %s" % metadata.timestamp)
else: else:
metadata.timestamp = isoformat(now()) metadata.timestamp = now()
if DEBUG: if DEBUG:
self.log.warning(" missing <metadata> block in OPF file") self.log.warning(" missing <metadata> block in OPF file")
self.log.info(" add timestamp: %s" % metadata.timestamp) self.log.info(" add timestamp: %s" % metadata.timestamp)

View File

@ -72,6 +72,15 @@ class SWEEX(USBMS):
EBOOK_DIR_MAIN = '' EBOOK_DIR_MAIN = ''
SUPPORTS_SUB_DIRS = True SUPPORTS_SUB_DIRS = True
class Q600(SWEEX):
    # Digma Q600 ebook reader: reuses the SWEEX driver wholesale, overriding
    # only identification and the list of supported formats.
    name = 'Digma Q600 Device interface'
    gui_name = 'Q600'
    description = _('Communicate with the Digma Q600')
    # Presumably the USB bcdDevice revision that distinguishes this device
    # from the base SWEEX hardware — TODO confirm against the SWEEX driver.
    BCD = [0x325]
    FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
class KOGAN(SWEEX): class KOGAN(SWEEX):
name = 'Kogan Device Interface' name = 'Kogan Device Interface'

View File

@ -9,11 +9,13 @@ Fetch metadata using Amazon AWS
import sys, re import sys, re
from lxml import html from lxml import html
from lxml.html import soupparser
from calibre import browser from calibre import browser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
def find_asin(br, isbn): def find_asin(br, isbn):
q = 'http://www.amazon.com/s?field-keywords='+isbn q = 'http://www.amazon.com/s?field-keywords='+isbn
@ -70,7 +72,7 @@ def get_metadata(br, asin, mi):
return False return False
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
root = html.fromstring(raw) root = soupparser.fromstring(raw)
ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
if ratings: if ratings:
pat = re.compile(r'([0-9.]+) out of (\d+) stars') pat = re.compile(r'([0-9.]+) out of (\d+) stars')
@ -95,25 +97,26 @@ def get_metadata(br, asin, mi):
# remove all attributes from tags # remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace # Collapse whitespace
desc = re.sub('\n+', '\n', desc) #desc = re.sub('\n+', '\n', desc)
desc = re.sub(' +', ' ', desc) #desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions # Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments # Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = desc mi.comments = sanitize_comments_html(desc)
return True return True
def main(args=sys.argv): def main(args=sys.argv):
# Test xisbn # Test xisbn
print get_social_metadata('Learning Python', None, None, '8324616489') #print get_social_metadata('Learning Python', None, None, '8324616489')
print #print
# Test sophisticated comment formatting # Test sophisticated comment formatting
print get_social_metadata('Swan Thieves', None, None, '9780316065795') print get_social_metadata('Angels & Demons', None, None, '9781416580829')
print print
return
# Random tests # Random tests
print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')

View File

@ -484,17 +484,22 @@ class DeviceMenu(QMenu): # {{{
_('Storage Card B')), _('Storage Card B')),
] ]
later_menus = []
for menu in (self, self.set_default_menu): for menu in (self, self.set_default_menu):
for actions, desc in ( for actions, desc in (
(basic_actions, ''), (basic_actions, ''),
(specific_actions, _('Send specific format to')),
(delete_actions, _('Send and delete from library')), (delete_actions, _('Send and delete from library')),
(specific_actions, _('Send specific format to'))
): ):
mdest = menu mdest = menu
if actions is not basic_actions: if actions is not basic_actions:
mdest = menu.addMenu(desc) mdest = QMenu(desc)
self._memory.append(mdest) self._memory.append(mdest)
later_menus.append(mdest)
if menu is self.set_default_menu:
menu.addMenu(mdest)
menu.addSeparator()
for dest, delete, specific, icon, text in actions: for dest, delete, specific, icon, text in actions:
action = DeviceAction(dest, delete, specific, icon, text, self) action = DeviceAction(dest, delete, specific, icon, text, self)
@ -507,7 +512,7 @@ class DeviceMenu(QMenu): # {{{
action.a_s.connect(self.action_triggered) action.a_s.connect(self.action_triggered)
self.actions.append(action) self.actions.append(action)
mdest.addAction(action) mdest.addAction(action)
if actions is not specific_actions: if actions is basic_actions:
menu.addSeparator() menu.addSeparator()
da = config['default_send_to_device_action'] da = config['default_send_to_device_action']
@ -525,14 +530,21 @@ class DeviceMenu(QMenu): # {{{
self.group.triggered.connect(self.change_default_action) self.group.triggered.connect(self.change_default_action)
self.addSeparator() self.addSeparator()
self.addMenu(later_menus[0])
self.addSeparator()
mitem = self.addAction(QIcon(I('eject.png')), _('Eject device')) mitem = self.addAction(QIcon(I('eject.png')), _('Eject device'))
mitem.setEnabled(False) mitem.setEnabled(False)
mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit()) mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit())
self.disconnect_mounted_device_action = mitem self.disconnect_mounted_device_action = mitem
self.addSeparator() self.addSeparator()
self.addMenu(self.set_default_menu) self.addMenu(self.set_default_menu)
self.addSeparator() self.addSeparator()
self.addMenu(later_menus[1])
self.addSeparator()
annot = self.addAction(_('Fetch annotations (experimental)')) annot = self.addAction(_('Fetch annotations (experimental)'))
annot.setEnabled(False) annot.setEnabled(False)
annot.triggered.connect(lambda x : annot.triggered.connect(lambda x :

View File

@ -11,11 +11,15 @@ from calibre.constants import preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
CData, Comment, Declaration, ProcessingInstruction CData, Comment, Declaration, ProcessingInstruction
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.utils.html2text import html2text
from calibre.ebooks.markdown import markdown
# Hackish - ignoring sentences ending or beginning in numbers to avoid # Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points. # confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])') lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
re.IGNORECASE)
def comments_to_html(comments): def comments_to_html(comments):
''' '''
@ -53,6 +57,9 @@ def comments_to_html(comments):
for x in comments.split('\n\n')] for x in comments.split('\n\n')]
return '\n'.join(parts) return '\n'.join(parts)
if sanitize_pat.search(comments) is not None:
return sanitize_comments_html(comments)
# Explode lost CRs to \n\n # Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
'.\r'), comments) '.\r'), comments)
@ -115,6 +122,11 @@ def comments_to_html(comments):
return result.renderContents(encoding=None) return result.renderContents(encoding=None)
def sanitize_comments_html(html):
text = html2text(html)
md = markdown.Markdown(safe_mode=True)
return md.convert(text)
def test(): def test():
for pat, val in [ for pat, val in [
('lineone\n\nlinetwo', ('lineone\n\nlinetwo',

View File

@ -118,16 +118,17 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
self.set_database(db) self.set_database(db)
cherrypy.config.update({ cherrypy.config.update({
'log.screen' : opts.develop, 'log.screen' : opts.develop,
'engine.autoreload_on' : opts.develop, 'engine.autoreload_on' : getattr(opts,
'tools.log_headers.on' : opts.develop, 'auto_reload', False),
'checker.on' : opts.develop, 'tools.log_headers.on' : opts.develop,
'request.show_tracebacks': show_tracebacks, 'checker.on' : opts.develop,
'server.socket_host' : listen_on, 'request.show_tracebacks': show_tracebacks,
'server.socket_port' : opts.port, 'server.socket_host' : listen_on,
'server.socket_timeout' : opts.timeout, #seconds 'server.socket_port' : opts.port,
'server.thread_pool' : opts.thread_pool, # number of threads 'server.socket_timeout' : opts.timeout, #seconds
}) 'server.thread_pool' : opts.thread_pool, # number of threads
})
if embedded or wsgi: if embedded or wsgi:
cherrypy.config.update({'engine.SIGHUP' : None, cherrypy.config.update({'engine.SIGHUP' : None,
'engine.SIGTERM' : None,}) 'engine.SIGTERM' : None,})

View File

@ -123,9 +123,10 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
def item(i): def item(i):
templ = (u'<div title="{4}" class="category-item">' templ = (u'<div title="{4}" class="category-item">'
'<div class="category-name">{0}</div><div>{1}</div>' '<div class="category-name">'
'<div>{2}' '<a href="{5}{3}" title="{4}">{0}</a></div>'
'<span class="href">{5}{3}</span></div></div>') '<div>{1}</div>'
'<div>{2}</div></div>')
rating, rstring = render_rating(i.avg_rating, prefix) rating, rstring = render_rating(i.avg_rating, prefix)
name = xml(i.name) name = xml(i.name)
if datatype == 'rating': if datatype == 'rating':
@ -142,7 +143,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
q = category q = category
href = '/browse/matches/%s/%s'%(quote(q), quote(id_)) href = '/browse/matches/%s/%s'%(quote(q), quote(id_))
return templ.format(xml(name), rating, return templ.format(xml(name), rating,
xml(desc), xml(href), rstring, prefix) xml(desc), xml(href, True), rstring, prefix)
items = list(map(item, items)) items = list(map(item, items))
return '\n'.join(['<div class="category-container">'] + items + ['</div>']) return '\n'.join(['<div class="category-container">'] + items + ['</div>'])
@ -252,8 +253,6 @@ class BrowseServer(object):
lp = self.db.library_path lp = self.db.library_path
if isbytestring(lp): if isbytestring(lp):
lp = force_unicode(lp, filesystem_encoding) lp = force_unicode(lp, filesystem_encoding)
if isinstance(ans, unicode):
ans = ans.encode('utf-8')
ans = ans.replace('{library_name}', xml(os.path.basename(lp))) ans = ans.replace('{library_name}', xml(os.path.basename(lp)))
ans = ans.replace('{library_path}', xml(lp, True)) ans = ans.replace('{library_path}', xml(lp, True))
ans = ans.replace('{initial_search}', initial_search) ans = ans.replace('{initial_search}', initial_search)
@ -335,9 +334,10 @@ class BrowseServer(object):
icon = 'blank.png' icon = 'blank.png'
cats.append((meta['name'], category, icon)) cats.append((meta['name'], category, icon))
cats = [('<li title="{2} {0}"><img src="{3}{src}" alt="{0}" />' cats = [('<li><a title="{2} {0}" href="/browse/category/{1}">&nbsp;</a>'
'<img src="{3}{src}" alt="{0}" />'
'<span class="label">{0}</span>' '<span class="label">{0}</span>'
'<span class="url">{3}/browse/category/{1}</span></li>') '</li>')
.format(xml(x, True), xml(quote(y)), xml(_('Browse books by')), .format(xml(x, True), xml(quote(y)), xml(_('Browse books by')),
self.opts.url_prefix, src='/browse/icon/'+z) self.opts.url_prefix, src='/browse/icon/'+z)
for x, y, z in cats] for x, y, z in cats]
@ -393,14 +393,15 @@ class BrowseServer(object):
for x in sorted(starts): for x in sorted(starts):
category_groups[x] = len([y for y in items if category_groups[x] = len([y for y in items if
getter(y).upper().startswith(x)]) getter(y).upper().startswith(x)])
items = [(u'<h3 title="{0}">{0} <span>[{2}]</span></h3><div>' items = [(u'<h3 title="{0}"><a class="load_href" title="{0}"'
u' href="{4}{3}"><strong>{0}</strong> [{2}]</a></h3><div>'
u'<div class="loaded" style="display:none"></div>' u'<div class="loaded" style="display:none"></div>'
u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" /><em>{1}</em></div>' u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" /><em>{1}</em></div>'
u'<span class="load_href">{4}{3}</span></div>').format( u'</div>').format(
xml(s, True), xml(s, True),
xml(_('Loading, please wait'))+'&hellip;', xml(_('Loading, please wait'))+'&hellip;',
unicode(c), unicode(c),
xml(u'/browse/category_group/%s/%s'%(category, s)), xml(u'/browse/category_group/%s/%s'%(category, s), True),
self.opts.url_prefix) self.opts.url_prefix)
for s, c in category_groups.items()] for s, c in category_groups.items()]
items = '\n\n'.join(items) items = '\n\n'.join(items)
@ -563,7 +564,8 @@ class BrowseServer(object):
if not val: if not val:
val = '' val = ''
args[key] = xml(val, True) args[key] = xml(val, True)
fname = ascii_filename(args['title']) + ' - ' + ascii_filename(args['authors']) fname = quote(ascii_filename(args['title']) + ' - ' +
ascii_filename(args['authors']))
return args, fmt, fmts, fname return args, fmt, fmts, fname
@Endpoint(mimetype='application/json; charset=utf-8') @Endpoint(mimetype='application/json; charset=utf-8')

View File

@ -70,10 +70,10 @@ class ContentServer(object):
id = id.rpartition('_')[-1].partition('.')[0] id = id.rpartition('_')[-1].partition('.')[0]
match = re.search(r'\d+', id) match = re.search(r'\d+', id)
if not match: if not match:
raise cherrypy.HTTPError(400, 'id:%s not an integer'%id) raise cherrypy.HTTPError(404, 'id:%s not an integer'%id)
id = int(match.group()) id = int(match.group())
if not self.db.has_id(id): if not self.db.has_id(id):
raise cherrypy.HTTPError(400, 'id:%d does not exist in database'%id) raise cherrypy.HTTPError(404, 'id:%d does not exist in database'%id)
if what == 'thumb' or what.startswith('thumb_'): if what == 'thumb' or what.startswith('thumb_'):
try: try:
width, height = map(int, what.split('_')[1:]) width, height = map(int, what.split('_')[1:])

View File

@ -58,6 +58,9 @@ The OPDS interface is advertised via BonJour automatically.
help=_('Specifies a restriction to be used for this invocation. ' help=_('Specifies a restriction to be used for this invocation. '
'This option overrides any per-library settings specified' 'This option overrides any per-library settings specified'
' in the GUI')) ' in the GUI'))
parser.add_option('--auto-reload', default=False, action='store_true',
help=_('Auto reload server when source code changes. May not'
' work in all environments.'))
return parser return parser

View File

@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import re, os import re, os
import __builtin__ import __builtin__
from urllib import quote
import cherrypy import cherrypy
from lxml import html from lxml import html
@ -115,8 +116,8 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
data = TD() data = TD()
for fmt in book['formats'].split(','): for fmt in book['formats'].split(','):
a = ascii_filename(book['authors']) a = quote(ascii_filename(book['authors']))
t = ascii_filename(book['title']) t = quote(ascii_filename(book['title']))
s = SPAN( s = SPAN(
A( A(
fmt.lower(), fmt.lower(),

View File

@ -17,18 +17,18 @@ sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
try: from textwrap import wrap try: from textwrap import wrap
except: pass except: pass
# Use Unicode characters instead of their ascii pseudo-replacements # Use Unicode characters instead of their ascii psuedo-replacements
UNICODE_SNOB = 0 UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end. # Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0 LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 0 BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link targets # Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway. # won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True SKIP_INTERNAL_LINKS = False
### Entity Nonsense ### ### Entity Nonsense ###
@ -41,13 +41,13 @@ def name2cp(k):
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0]) return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae', 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
unifiable_n = {} unifiable_n = {}
@ -60,7 +60,7 @@ def charref(name):
c = int(name[1:], 16) c = int(name[1:], 16)
else: else:
c = int(name) c = int(name)
if not UNICODE_SNOB and c in unifiable_n.keys(): if not UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c] return unifiable_n[c]
else: else:
@ -76,14 +76,14 @@ def entityref(c):
def replaceEntities(s): def replaceEntities(s):
s = s.group(1) s = s.group(1)
if s[0] == "#": if s[0] == "#":
return charref(s[1:]) return charref(s[1:])
else: return entityref(s) else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s): def unescape(s):
return r_unescape.sub(replaceEntities, s) return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs): def fixattrs(attrs):
# Fix bug in sgmllib.py # Fix bug in sgmllib.py
if not attrs: return attrs if not attrs: return attrs
@ -105,7 +105,7 @@ def optwrap(text):
"""Wrap all paragraphs in the provided text.""" """Wrap all paragraphs in the provided text."""
if not BODY_WIDTH: if not BODY_WIDTH:
return text return text
assert wrap, "Requires Python 2.3." assert wrap, "Requires Python 2.3."
result = '' result = ''
newlines = 0 newlines = 0
@ -136,7 +136,7 @@ def hn(tag):
class _html2text(sgmllib.SGMLParser): class _html2text(sgmllib.SGMLParser):
def __init__(self, out=None, baseurl=''): def __init__(self, out=None, baseurl=''):
sgmllib.SGMLParser.__init__(self) sgmllib.SGMLParser.__init__(self)
if out is None: self.out = self.outtextf if out is None: self.out = self.outtextf
else: self.out = out else: self.out = out
self.outtext = u'' self.outtext = u''
@ -157,43 +157,43 @@ class _html2text(sgmllib.SGMLParser):
self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl self.baseurl = baseurl
def outtextf(self, s): def outtextf(self, s):
self.outtext += s self.outtext += s
def close(self): def close(self):
sgmllib.SGMLParser.close(self) sgmllib.SGMLParser.close(self)
self.pbr() self.pbr()
self.o('', 0, 'end') self.o('', 0, 'end')
return self.outtext return self.outtext
def handle_charref(self, c): def handle_charref(self, c):
self.o(charref(c)) self.o(charref(c))
def handle_entityref(self, c): def handle_entityref(self, c):
self.o(entityref(c)) self.o(entityref(c))
def unknown_starttag(self, tag, attrs): def unknown_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1) self.handle_tag(tag, attrs, 1)
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
self.handle_tag(tag, None, 0) self.handle_tag(tag, None, 0)
def previousIndex(self, attrs): def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the """ returns the index of certain set of attributes (of a link) in the
self.a list self.a list
If the set of attributes is not found, returns None If the set of attributes is not found, returns None
""" """
if not attrs.has_key('href'): return None if not attrs.has_key('href'): return None
i = -1 i = -1
for a in self.a: for a in self.a:
i += 1 i += 1
match = 0 match = 0
if a.has_key('href') and a['href'] == attrs['href']: if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'): if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and if (a.has_key('title') and attrs.has_key('title') and
@ -206,13 +206,13 @@ class _html2text(sgmllib.SGMLParser):
def handle_tag(self, tag, attrs, start): def handle_tag(self, tag, attrs, start):
attrs = fixattrs(attrs) attrs = fixattrs(attrs)
if hn(tag): if hn(tag):
self.p() self.p()
if start: self.o(hn(tag)*"#" + ' ') if start: self.o(hn(tag)*"#" + ' ')
if tag in ['p', 'div']: self.p() if tag in ['p', 'div']: self.p()
if tag == "br" and start: self.o(" \n") if tag == "br" and start: self.o(" \n")
if tag == "hr" and start: if tag == "hr" and start:
@ -220,21 +220,21 @@ class _html2text(sgmllib.SGMLParser):
self.o("* * *") self.o("* * *")
self.p() self.p()
if tag in ["head", "style", 'script']: if tag in ["head", "style", 'script']:
if start: self.quiet += 1 if start: self.quiet += 1
else: self.quiet -= 1 else: self.quiet -= 1
if tag in ["body"]: if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head> self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote": if tag == "blockquote":
if start: if start:
self.p(); self.o('> ', 0, 1); self.start = 1 self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1 self.blockquote += 1
else: else:
self.blockquote -= 1 self.blockquote -= 1
self.p() self.p()
if tag in ['em', 'i', 'u']: self.o("_") if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**") if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
@ -243,7 +243,7 @@ class _html2text(sgmllib.SGMLParser):
attrsD = {} attrsD = {}
for (x, y) in attrs: attrsD[x] = y for (x, y) in attrs: attrsD[x] = y
attrs = attrsD attrs = attrsD
self.abbr_title = None self.abbr_title = None
self.abbr_data = '' self.abbr_data = ''
if attrs.has_key('title'): if attrs.has_key('title'):
@ -253,13 +253,13 @@ class _html2text(sgmllib.SGMLParser):
self.abbr_list[self.abbr_data] = self.abbr_title self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None self.abbr_title = None
self.abbr_data = '' self.abbr_data = ''
if tag == "a": if tag == "a":
if start: if start:
attrsD = {} attrsD = {}
for (x, y) in attrs: attrsD[x] = y for (x, y) in attrs: attrsD[x] = y
attrs = attrsD attrs = attrsD
if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs) self.astack.append(attrs)
self.o("[") self.o("[")
else: else:
@ -277,7 +277,7 @@ class _html2text(sgmllib.SGMLParser):
a['outcount'] = self.outcount a['outcount'] = self.outcount
self.a.append(a) self.a.append(a)
self.o("][" + `a['count']` + "]") self.o("][" + `a['count']` + "]")
if tag == "img" and start: if tag == "img" and start:
attrsD = {} attrsD = {}
for (x, y) in attrs: attrsD[x] = y for (x, y) in attrs: attrsD[x] = y
@ -296,20 +296,20 @@ class _html2text(sgmllib.SGMLParser):
self.o("![") self.o("![")
self.o(alt) self.o(alt)
self.o("]["+`attrs['count']`+"]") self.o("]["+`attrs['count']`+"]")
if tag == 'dl' and start: self.p() if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr() if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ') if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr() if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]: if tag in ["ol", "ul"]:
if start: if start:
self.list.append({'name':tag, 'num':0}) self.list.append({'name':tag, 'num':0})
else: else:
if self.list: self.list.pop() if self.list: self.list.pop()
self.p() self.p()
if tag == 'li': if tag == 'li':
if start: if start:
self.pbr() self.pbr()
@ -323,10 +323,10 @@ class _html2text(sgmllib.SGMLParser):
self.start = 1 self.start = 1
else: else:
self.pbr() self.pbr()
if tag in ["table", "tr"] and start: self.p() if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr() if tag == 'td': self.pbr()
if tag == "pre": if tag == "pre":
if start: if start:
self.startpre = 1 self.startpre = 1
@ -334,34 +334,34 @@ class _html2text(sgmllib.SGMLParser):
else: else:
self.pre = 0 self.pre = 0
self.p() self.p()
def pbr(self): def pbr(self):
if self.p_p == 0: self.p_p = 1 if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2 def p(self): self.p_p = 2
def o(self, data, puredata=0, force=0): def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data if self.abbr_data is not None: self.abbr_data += data
if not self.quiet: if not self.quiet:
if puredata and not self.pre: if puredata and not self.pre:
data = re.sub('\s+', ' ', data) data = re.sub('\s+', ' ', data)
if data and data[0] == ' ': if data and data[0] == ' ':
self.space = 1 self.space = 1
data = data[1:] data = data[1:]
if not data and not force: return if not data and not force: return
if self.startpre: if self.startpre:
#self.out(" :") #TODO: not output when already one there #self.out(" :") #TODO: not output when already one there
self.startpre = 0 self.startpre = 0
bq = (">" * self.blockquote) bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " " if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre: if self.pre:
bq += " " bq += " "
data = data.replace("\n", "\n"+bq) data = data.replace("\n", "\n"+bq)
if self.start: if self.start:
self.space = 0 self.space = 0
self.p_p = 0 self.p_p = 0
@ -377,7 +377,7 @@ class _html2text(sgmllib.SGMLParser):
if self.p_p: if self.p_p:
self.out(('\n'+bq)*self.p_p) self.out(('\n'+bq)*self.p_p)
self.space = 0 self.space = 0
if self.space: if self.space:
if not self.lastWasNL: self.out(' ') if not self.lastWasNL: self.out(' ')
self.space = 0 self.space = 0
@ -388,7 +388,7 @@ class _html2text(sgmllib.SGMLParser):
newa = [] newa = []
for link in self.a: for link in self.a:
if self.outcount > link['outcount']: if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
if link.has_key('title'): self.out(" ("+link['title']+")") if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n") self.out("\n")
else: else:
@ -397,7 +397,7 @@ class _html2text(sgmllib.SGMLParser):
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa self.a = newa
if self.abbr_list and force == "end": if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items(): for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n") self.out(" *[" + abbr + "]: " + definition + "\n")
@ -410,7 +410,7 @@ class _html2text(sgmllib.SGMLParser):
def handle_data(self, data): def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1 if r'\/script>' in data: self.quiet -= 1
self.o(data, 1) self.o(data, 1)
def unknown_decl(self, data): pass def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8')) def wrapwrite(text): sys.stdout.write(text.encode('utf8'))