KG updates

This commit is contained in:
GRiker 2010-11-01 03:55:32 -07:00
commit 7116a0a744
21 changed files with 853 additions and 89 deletions

View File

@ -208,6 +208,8 @@ h2.library_name {
} }
.toplevel li a { text-decoration: none; }
.toplevel li img { .toplevel li img {
vertical-align: middle; vertical-align: middle;
margin-right: 1em; margin-right: 1em;
@ -261,9 +263,16 @@ h2.library_name {
} }
.category div.category-item span.href { display: none } .category div.category-item a { text-decoration: none; color: inherit; }
#groups span.load_href { display: none } #groups a.load_href {
text-decoration: none;
color: inherit;
font-size: medium;
font-weight: normal;
padding: 0;
padding-left: 0.5em;
}
#groups h3 { #groups h3 {
font-weight: bold; font-weight: bold;

View File

@ -116,7 +116,7 @@ function toplevel() {
$(".sort_select").hide(); $(".sort_select").hide();
$(".toplevel li").click(function() { $(".toplevel li").click(function() {
var href = $(this).children("span.url").text(); var href = $(this).children("a").attr('href');
window.location = href; window.location = href;
}); });
@ -133,7 +133,7 @@ function render_error(msg) {
// Category feed {{{ // Category feed {{{
function category_clicked() { function category_clicked() {
var href = $(this).find("span.href").html(); var href = $(this).find("a").attr('href');
window.location = href; window.location = href;
} }
@ -151,7 +151,7 @@ function category() {
change: function(event, ui) { change: function(event, ui) {
if (ui.newContent) { if (ui.newContent) {
var href = ui.newContent.children("span.load_href").html(); var href = ui.newContent.prev().children("a.load_href").attr('href');
ui.newContent.children(".loading").show(); ui.newContent.children(".loading").show();
if (href) { if (href) {
$.ajax({ $.ajax({

View File

@ -0,0 +1,50 @@
from calibre.web.feeds.news import BasicNewsRecipe
class ClicRBS(BasicNewsRecipe):
    """Fetch articles from the clicRBS network of Brazilian news sites
    (zerohora, diariocatarinense, Pioneiro, etc.) via their RSS feeds."""

    title = u'ClicRBS'
    language = 'pt'
    __author__ = 'arvoredo'
    oldest_article = 3
    max_articles_per_feed = 9
    cover_url = 'http://www.publicidade.clicrbs.com.br/clicrbs/imgs/logo_clic.gif'

    # Strip the site chrome (top bar and mobile-version button).
    remove_tags = [
        dict(name='div', attrs={'class':['clic-barra-inner', 'botao-versao-mobile ']})
    ]

    # NOTE(review): the original assigned remove_tags_before four times and
    # remove_tags_after four times in a row; each assignment overwrote the
    # previous one (one even targeted a bogus tag name 'div ' with a trailing
    # space), so only the final assignment of each ever took effect. The dead
    # assignments are removed here — behavior is unchanged.
    remove_tags_before = dict(name='div', attrs={'class':'coluna'})
    remove_tags_after = dict(name='ul', attrs={'class':'lista'})

    feeds = [
        (u'zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=1&local=1&channel=13')
        , (u'diariocatarinense.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?uf=2&local=18&channel=67')
        , (u'Concursos e Emprego', u'http://g1.globo.com/Rss2/0,,AS0-9654,00.xml')
        , (u'Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?channel=87&uf=1&local=1')
        , (u'Economia, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=801&uf=1&local=1&channel=13')
        , (u'Esportes, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=802&uf=1&local=1&channel=13')
        , (u'Economia, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1180&channel=87&uf=1&local=1')
        , (u'Política, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1185&channel=87&uf=1&local=1')
        , (u'Mundo, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1184&channel=87&uf=1&local=1')
        , (u'Catarinense, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=&theme=371&uf=2&channel=2')
        , (u'Geral, Pioneiro.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1183&channel=87&uf=1&local=1')
        , (u'Estilo de Vida, zerohora.com, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=805&uf=1&local=1&channel=13')
        , (u'Corrida, Corrida, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1313&theme=15704&uf=1&channel=2')
        , (u'Jornal de Santa Catarina, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?espid=159&uf=2&local=18')
        , (u'Grêmio, Futebol, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=11&theme=65&uf=1&channel=2')
        , (u'Velocidade, Esportes, clicRBS', u'http://www.clicrbs.com.br/jsp/rssfeed.jspx?sect_id=1314&theme=2655&uf=1&channel=2')
    ]

    extra_css = '''
        cite{color:#007BB5; font-size:xx-small; font-style:italic;}
        body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        h3{font-size:large; color:#082963; font-weight:bold;}
        #ident{color:#0179B4; font-size:xx-small;}
        p{color:#000000;font-weight:normal;}
        .commentario p{color:#007BB5; font-style:italic;}
    '''

View File

@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CMJornal_pt(BasicNewsRecipe):
    """Download current news from Correio da Manha (Portugal)."""

    title = 'Correio da Manha - Portugal'
    __author__ = 'jmst'
    description = 'As noticias de Portugal e do Mundo'
    publisher = 'Cofina Media'
    category = ''
    oldest_article = 1
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'pt'
    extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} '

    # Forward the recipe metadata into the e-book conversion pipeline.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Keep only the headline tags and the article body container.
    keep_only_tags = [
        dict(name=['h2', 'h1']),
        dict(name='div', attrs={'class': ['news']}),
    ]

    # Drop embedded media objects and empty in-page anchors.
    remove_tags = [
        dict(name=['object', 'embed', 'iframe']),
        dict(name='a', attrs={'href': ['#']}),
    ]

    feeds = [
        (u'Actualidade', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000009-0000-0000-0000-000000000009'),
        (u'Portugal', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000010-0000-0000-0000-000000000010'),
        (u'Economia', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000011-0000-0000-0000-000000000011'),
        (u'Mundo', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000091-0000-0000-0000-000000000091'),
        (u'Desporto', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000012-0000-0000-0000-000000000012'),
        (u'TV & Media', u'http://www.cmjornal.xl.pt/rss/rss.aspx?channelID=00000092-0000-0000-0000-000000000092'),
    ]

    def print_version(self, url):
        """Return the printer-friendly URL for an article page."""
        printable = url.replace('noticia.aspx', 'Imprimir.aspx')
        return printable

View File

@ -0,0 +1,77 @@
from calibre.web.feeds.news import BasicNewsRecipe
class ElFaroDeVigo(BasicNewsRecipe):
    """Fetch news from El Faro de Vigo (farodevigo.es) via its RSS feeds."""

    title = u'El Faro de Vigo'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Jefferson Frantz'
    description = 'Noticias de Vigo'
    timefmt = ' [%d %b, %Y]'
    language = 'es'
    encoding = 'cp1252'
    no_stylesheets = True
    remove_javascript = True

    feeds = [
##        (u'Vigo', u'http://www.farodevigo.es/elementosInt/rss/1'),
##        (u'Gran Vigo', u'http://www.farodevigo.es/elementosInt/rss/2'),
        (u'Galicia', u'http://www.farodevigo.es/elementosInt/rss/4'),
        (u'España', u'http://www.farodevigo.es/elementosInt/rss/6'),
        (u'Mundo', u'http://www.farodevigo.es/elementosInt/rss/7'),
##        (u'Opinión', u'http://www.farodevigo.es/elementosInt/rss/5'),
        (u'Economía', u'http://www.farodevigo.es/elementosInt/rss/10'),
        (u'Sociedad y Cultura', u'http://www.farodevigo.es/elementosInt/rss/8'),
        (u'Sucesos', u'http://www.farodevigo.es/elementosInt/rss/9'),
        (u'Deportes', u'http://www.farodevigo.es/elementosInt/rss/11'),
        (u'Agenda', u'http://www.farodevigo.es/elementosInt/rss/21'),
        (u'Gente', u'http://www.farodevigo.es/elementosInt/rss/24'),
        (u'Televisión', u'http://www.farodevigo.es/elementosInt/rss/25'),
        (u'Ciencia y Tecnología', u'http://www.farodevigo.es/elementosInt/rss/26')]

    extra_css = '''.noticia_texto{ font-family: sans-serif; font-size: medium; text-align: justify }
                   h1{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}
                   h2{font-family: serif; font-size: medium; font-weight: bold; color: #000000; text-align: left}
                   .enlacenegrita10{font-family: serif; font-size: small; font-weight: bold; color: #000000; text-align: left}
                   .noticia_titular{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}'''

    def preprocess_html(self, soup):
        """Strip inline styles and the social "share bar" image, then
        adeify remaining images for e-book conversion."""
        # Remove inline styles so extra_css controls formatting.
        for item in soup.findAll(style=True):
            del item['style']

        # The share-bar gif may be served from any of three static hosts;
        # the original repeated this stanza verbatim for each host, which is
        # folded into a loop here (same hosts, same order, same behavior —
        # note the double slash in the path is intentional/as published).
        for host in ('estaticos00', 'estaticos01', 'estaticos02'):
            url = ('http://%s.farodevigo.es//elementosWeb/mediaweb/'
                   'images/compartir/barrapunto.gif' % host)
            fitem = soup.find('img', src=url)
            if fitem:
                # Drop the whole parent element containing the share bar.
                fitem.parent.extract()

        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        """Left-align the bold-link blocks after conversion."""
        divs = soup.findAll(True, {'class':'enlacenegrita10'})
        for div in divs:
            div['align'] = 'left'
        return soup

    # Keep only the main news container.
    keep_only_tags = [dict(name='div', attrs={'class':['noticias']})]

    # Remove scripts, embeds, lists and the various sidebar/extra blocks.
    remove_tags = [
        dict(name=['object','link','script','ul','iframe','ol'])
        ,dict(name='div', attrs={'class':['noticiadd2', 'cintillo2', 'noticiadd', 'noticiadd2']})
        ,dict(name='div', attrs={'class':['imagen_derecha', 'noticiadd3', 'extraHTML']})
    ]

View File

@ -1,7 +1,9 @@
cense__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau' __copyright__ = '2010, Eddie Lau'
''' '''
modified from Singtao Toronto calibre recipe by rty modified from Singtao Toronto calibre recipe by rty
Change Log:
2010/10/31: skip repeated articles in section pages
''' '''
import datetime import datetime
@ -23,14 +25,13 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
recursions = 0 recursions = 0
conversion_options = {'linearize_tables':True} conversion_options = {'linearize_tables':True}
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'), keep_only_tags = [dict(name='h1'),
dict(attrs={'id':['newscontent01','newscontent02']})] dict(attrs={'id':['newscontent01','newscontent02']})]
def get_fetchdate(self): def get_fetchdate(self):
dt_utc = datetime.datetime.utcnow() dt_utc = datetime.datetime.utcnow()
# convert UTC to local hk time # convert UTC to local hk time - at around HKT 5.30am, all news are available
dt_local = dt_utc - datetime.timedelta(-8.0/24) dt_local = dt_utc - datetime.timedelta(-2.5/24)
return dt_local.strftime("%Y%m%d") return dt_local.strftime("%Y%m%d")
def parse_index(self): def parse_index(self):
@ -47,18 +48,14 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
divs = soup.findAll(attrs={'class': ['bullet']}) divs = soup.findAll(attrs={'class': ['bullet']})
current_articles = [] current_articles = []
included_urls = []
for i in divs: for i in divs:
a = i.find('a', href = True) a = i.find('a', href = True)
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = a.get('href', False) url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url url = 'http://news.mingpao.com/' + dateStr + '/' +url
if url not in included_urls:
current_articles.append({'title': title, 'url': url, 'description':''}) current_articles.append({'title': title, 'url': url, 'description':''})
included_urls.append(url)
return current_articles return current_articles
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(width=True):
del item['width']
return soup

View File

@ -28,7 +28,9 @@ If there are no windows binaries already compiled for the version of python you
Run the following command to install python dependencies:: Run the following command to install python dependencies::
easy_install --always-unzip -U ipython mechanize BeautifulSoup pyreadline python-dateutil dnspython easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython
Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)
Qt Qt
-------- --------

View File

@ -5,10 +5,46 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import subprocess, tempfile, os, time import subprocess, tempfile, os, time, sys, telnetlib
from threading import RLock
from setup import Command from setup import Command
try:
from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent
except:
wm = None
else:
wm = WatchManager()
flags = EventsCodes.ALL_FLAGS
mask = flags['IN_MODIFY']
class ProcessEvents(ProcessEvent):
    # pyinotify event handler for the development server: on any change to a
    # watched file it restarts the calibre server, and additionally triggers
    # a browser reload when the changed file is a Python source file.

    def __init__(self, command):
        # command: the Server command object providing kill_server(),
        # launch_server(), reload_browser() and the interactive prompt.
        ProcessEvent.__init__(self)
        self.command = command

    def process_default(self, event):
        # Catch-all callback invoked by pyinotify for every event kind.
        name = getattr(event,
                'name', None)
        if not name:
            # Some events (e.g. on the watched directory itself) carry no
            # file name; nothing useful to do for those.
            return
        ext = os.path.splitext(name)[1]
        reload = False
        if ext == '.py':
            # Python source changed: the browser should be reloaded after
            # the server restarts.
            reload = True
        print
        print name, 'changed'
        # Restart the dev server so the change takes effect.
        self.command.kill_server()
        self.command.launch_server()
        # Re-display the interactive prompt that the restart output clobbered.
        print self.command.prompt,
        sys.stdout.flush()
        if reload:
            # Delay gives the restarted server time to come up before the
            # browser re-requests the page.
            self.command.reload_browser(delay=1)
class Server(Command): class Server(Command):
description = 'Run the calibre server in development mode conveniently' description = 'Run the calibre server in development mode conveniently'
@ -18,31 +54,75 @@ class Server(Command):
def rebuild_monocole(self): def rebuild_monocole(self):
subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH, subprocess.check_call(['sprocketize', '-C', self.MONOCLE_PATH,
'-I', 'src', 'src/monocle.js'], '-I', 'src', 'src/monocle.js'],
stdout=open('resources/content_server/monocle.js', 'wb')) stdout=open('resources/content_server/read/monocle.js', 'wb'))
def launch_server(self, log): def launch_server(self):
print 'Starting server...\n'
with self.lock:
self.rebuild_monocole() self.rebuild_monocole()
p = subprocess.Popen(['calibre-server', '--develop'], self.server_proc = p = subprocess.Popen(['calibre-server', '--develop'],
stderr=subprocess.STDOUT, stdout=log) stderr=subprocess.STDOUT, stdout=self.server_log)
time.sleep(0.2)
if p.poll() is not None:
print 'Starting server failed'
raise SystemExit(1)
return p return p
def kill_server(self):
print 'Killing server...\n'
if self.server_proc is not None:
with self.lock:
if self.server_proc.poll() is None:
self.server_proc.terminate()
while self.server_proc.poll() is None:
time.sleep(0.1)
def watch(self):
if wm is not None:
self.notifier = ThreadedNotifier(wm, ProcessEvents(self))
self.notifier.start()
self.wdd = wm.add_watch(os.path.abspath('src'), mask, rec=True)
def reload_browser(self, delay=0.1):
time.sleep(delay)
try:
t = telnetlib.Telnet('localhost', 4242)
t.read_until("repl>")
t.write('BrowserReload();')
print t.read_until("repl>")
t.close()
except:
print 'Failed to reload browser'
import traceback
traceback.print_exc()
def run(self, opts): def run(self, opts):
self.lock = RLock()
tdir = tempfile.gettempdir() tdir = tempfile.gettempdir()
logf = os.path.join(tdir, 'calibre-server.log') logf = os.path.join(tdir, 'calibre-server.log')
log = open(logf, 'ab') self.server_log = open(logf, 'ab')
self.prompt = 'Press Enter to kill/restart server. Ctrl+C to quit: '
print 'Server log available at:', logf print 'Server log available at:', logf
print
self.watch()
first = True
while True: while True:
print 'Starting server...' self.launch_server()
p = self.launch_server(log) if not first:
self.reload_browser()
first = False
try: try:
raw_input('Press Enter to kill/restart server. Ctrl+C to quit: ') raw_input(self.prompt)
except: except:
print
self.kill_server()
break break
else: else:
while p.returncode is None: self.kill_server()
p.terminate()
time.sleep(0.1)
p.kill()
print print
if hasattr(self, 'notifier'):
self.notifier.stop()

View File

@ -475,7 +475,7 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
SOVOS, PICO SOVOS, PICO
from calibre.devices.sne.driver import SNE from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \ from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO from calibre.devices.kobo.driver import KOBO
@ -586,6 +586,7 @@ plugins += [
AVANT, AVANT,
MENTOR, MENTOR,
SWEEX, SWEEX,
Q600,
KOGAN, KOGAN,
PDNOVEL, PDNOVEL,
SPECTRA, SPECTRA,

View File

@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.epub import set_metadata from calibre.ebooks.metadata.epub import set_metadata
from calibre.library.server.utils import strftime from calibre.library.server.utils import strftime
from calibre.utils.config import config_dir, prefs from calibre.utils.config import config_dir, prefs
from calibre.utils.date import isoformat, now, parse_date from calibre.utils.date import now, parse_date
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -2521,11 +2521,11 @@ class ITUNES(DriverBase):
metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour, metadata.timestamp = datetime.datetime(old_ts.year, old_ts.month, old_ts.day, old_ts.hour,
old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo) old_ts.minute, old_ts.second, old_ts.microsecond+1, old_ts.tzinfo)
else: else:
metadata.timestamp = isoformat(now()) metadata.timestamp = now()
if DEBUG: if DEBUG:
self.log.info(" add timestamp: %s" % metadata.timestamp) self.log.info(" add timestamp: %s" % metadata.timestamp)
else: else:
metadata.timestamp = isoformat(now()) metadata.timestamp = now()
if DEBUG: if DEBUG:
self.log.warning(" missing <metadata> block in OPF file") self.log.warning(" missing <metadata> block in OPF file")
self.log.info(" add timestamp: %s" % metadata.timestamp) self.log.info(" add timestamp: %s" % metadata.timestamp)

View File

@ -72,6 +72,15 @@ class SWEEX(USBMS):
EBOOK_DIR_MAIN = '' EBOOK_DIR_MAIN = ''
SUPPORTS_SUB_DIRS = True SUPPORTS_SUB_DIRS = True
class Q600(SWEEX):
    # Driver for the Digma Q600 e-book reader. Inherits all USB mass-storage
    # behavior from the SWEEX driver, overriding only identity strings, the
    # device revision and the list of supported formats.

    name = 'Digma Q600 Device interface'
    gui_name = 'Q600'
    description = _('Communicate with the Digma Q600')

    # presumably the bcdDevice revision used for USB detection — confirm
    # against the SWEEX base driver's matching logic
    BCD = [0x325]

    # Formats the device accepts, in order of preference.
    FORMATS = ['epub', 'fb2', 'mobi', 'prc', 'html', 'rtf', 'chm', 'pdf', 'txt']
class KOGAN(SWEEX): class KOGAN(SWEEX):
name = 'Kogan Device Interface' name = 'Kogan Device Interface'

View File

@ -9,11 +9,13 @@ Fetch metadata using Amazon AWS
import sys, re import sys, re
from lxml import html from lxml import html
from lxml.html import soupparser
from calibre import browser from calibre import browser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
def find_asin(br, isbn): def find_asin(br, isbn):
q = 'http://www.amazon.com/s?field-keywords='+isbn q = 'http://www.amazon.com/s?field-keywords='+isbn
@ -70,7 +72,10 @@ def get_metadata(br, asin, mi):
return False return False
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
root = html.fromstring(raw) try:
root = soupparser.fromstring(raw)
except:
return False
ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
if ratings: if ratings:
pat = re.compile(r'([0-9.]+) out of (\d+) stars') pat = re.compile(r'([0-9.]+) out of (\d+) stars')
@ -95,25 +100,26 @@ def get_metadata(br, asin, mi):
# remove all attributes from tags # remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace # Collapse whitespace
desc = re.sub('\n+', '\n', desc) #desc = re.sub('\n+', '\n', desc)
desc = re.sub(' +', ' ', desc) #desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions # Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc) desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments # Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = desc mi.comments = sanitize_comments_html(desc)
return True return True
def main(args=sys.argv): def main(args=sys.argv):
# Test xisbn # Test xisbn
print get_social_metadata('Learning Python', None, None, '8324616489') #print get_social_metadata('Learning Python', None, None, '8324616489')
print #print
# Test sophisticated comment formatting # Test sophisticated comment formatting
print get_social_metadata('Swan Thieves', None, None, '9780316065795') print get_social_metadata('Angels & Demons', None, None, '9781416580829')
print print
return
# Random tests # Random tests
print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')

View File

@ -484,17 +484,22 @@ class DeviceMenu(QMenu): # {{{
_('Storage Card B')), _('Storage Card B')),
] ]
later_menus = []
for menu in (self, self.set_default_menu): for menu in (self, self.set_default_menu):
for actions, desc in ( for actions, desc in (
(basic_actions, ''), (basic_actions, ''),
(specific_actions, _('Send specific format to')),
(delete_actions, _('Send and delete from library')), (delete_actions, _('Send and delete from library')),
(specific_actions, _('Send specific format to'))
): ):
mdest = menu mdest = menu
if actions is not basic_actions: if actions is not basic_actions:
mdest = menu.addMenu(desc) mdest = QMenu(desc)
self._memory.append(mdest) self._memory.append(mdest)
later_menus.append(mdest)
if menu is self.set_default_menu:
menu.addMenu(mdest)
menu.addSeparator()
for dest, delete, specific, icon, text in actions: for dest, delete, specific, icon, text in actions:
action = DeviceAction(dest, delete, specific, icon, text, self) action = DeviceAction(dest, delete, specific, icon, text, self)
@ -507,7 +512,7 @@ class DeviceMenu(QMenu): # {{{
action.a_s.connect(self.action_triggered) action.a_s.connect(self.action_triggered)
self.actions.append(action) self.actions.append(action)
mdest.addAction(action) mdest.addAction(action)
if actions is not specific_actions: if actions is basic_actions:
menu.addSeparator() menu.addSeparator()
da = config['default_send_to_device_action'] da = config['default_send_to_device_action']
@ -525,14 +530,21 @@ class DeviceMenu(QMenu): # {{{
self.group.triggered.connect(self.change_default_action) self.group.triggered.connect(self.change_default_action)
self.addSeparator() self.addSeparator()
self.addMenu(later_menus[0])
self.addSeparator()
mitem = self.addAction(QIcon(I('eject.png')), _('Eject device')) mitem = self.addAction(QIcon(I('eject.png')), _('Eject device'))
mitem.setEnabled(False) mitem.setEnabled(False)
mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit()) mitem.triggered.connect(lambda x : self.disconnect_mounted_device.emit())
self.disconnect_mounted_device_action = mitem self.disconnect_mounted_device_action = mitem
self.addSeparator() self.addSeparator()
self.addMenu(self.set_default_menu) self.addMenu(self.set_default_menu)
self.addSeparator() self.addSeparator()
self.addMenu(later_menus[1])
self.addSeparator()
annot = self.addAction(_('Fetch annotations (experimental)')) annot = self.addAction(_('Fetch annotations (experimental)'))
annot.setEnabled(False) annot.setEnabled(False)
annot.triggered.connect(lambda x : annot.triggered.connect(lambda x :

View File

@ -11,11 +11,15 @@ from calibre.constants import preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
CData, Comment, Declaration, ProcessingInstruction CData, Comment, Declaration, ProcessingInstruction
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.utils.html2text import html2text
from calibre.ebooks.markdown import markdown
# Hackish - ignoring sentences ending or beginning in numbers to avoid # Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points. # confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])') lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
re.IGNORECASE)
def comments_to_html(comments): def comments_to_html(comments):
''' '''
@ -53,6 +57,15 @@ def comments_to_html(comments):
for x in comments.split('\n\n')] for x in comments.split('\n\n')]
return '\n'.join(parts) return '\n'.join(parts)
if sanitize_pat.search(comments) is not None:
try:
return sanitize_comments_html(comments)
except:
import traceback
traceback.print_exc()
return u'<p></p>'
# Explode lost CRs to \n\n # Explode lost CRs to \n\n
comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
'.\r'), comments) '.\r'), comments)
@ -115,6 +128,11 @@ def comments_to_html(comments):
return result.renderContents(encoding=None) return result.renderContents(encoding=None)
def sanitize_comments_html(html):
    """Sanitize untrusted comment HTML by round-tripping it through
    Markdown: convert to plain Markdown text, then re-render with
    safe_mode enabled so dangerous markup is neutralized."""
    as_markdown = html2text(html)
    return markdown.Markdown(safe_mode=True).convert(as_markdown)
def test(): def test():
for pat, val in [ for pat, val in [
('lineone\n\nlinetwo', ('lineone\n\nlinetwo',

View File

@ -119,7 +119,8 @@ class LibraryServer(ContentServer, MobileServer, XMLServer, OPDSServer, Cache,
cherrypy.config.update({ cherrypy.config.update({
'log.screen' : opts.develop, 'log.screen' : opts.develop,
'engine.autoreload_on' : opts.develop, 'engine.autoreload_on' : getattr(opts,
'auto_reload', False),
'tools.log_headers.on' : opts.develop, 'tools.log_headers.on' : opts.develop,
'checker.on' : opts.develop, 'checker.on' : opts.develop,
'request.show_tracebacks': show_tracebacks, 'request.show_tracebacks': show_tracebacks,

View File

@ -123,9 +123,10 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
def item(i): def item(i):
templ = (u'<div title="{4}" class="category-item">' templ = (u'<div title="{4}" class="category-item">'
'<div class="category-name">{0}</div><div>{1}</div>' '<div class="category-name">'
'<div>{2}' '<a href="{5}{3}" title="{4}">{0}</a></div>'
'<span class="href">{5}{3}</span></div></div>') '<div>{1}</div>'
'<div>{2}</div></div>')
rating, rstring = render_rating(i.avg_rating, prefix) rating, rstring = render_rating(i.avg_rating, prefix)
name = xml(i.name) name = xml(i.name)
if datatype == 'rating': if datatype == 'rating':
@ -142,7 +143,7 @@ def get_category_items(category, items, restriction, datatype, prefix): # {{{
q = category q = category
href = '/browse/matches/%s/%s'%(quote(q), quote(id_)) href = '/browse/matches/%s/%s'%(quote(q), quote(id_))
return templ.format(xml(name), rating, return templ.format(xml(name), rating,
xml(desc), xml(href), rstring, prefix) xml(desc), xml(href, True), rstring, prefix)
items = list(map(item, items)) items = list(map(item, items))
return '\n'.join(['<div class="category-container">'] + items + ['</div>']) return '\n'.join(['<div class="category-container">'] + items + ['</div>'])
@ -252,8 +253,6 @@ class BrowseServer(object):
lp = self.db.library_path lp = self.db.library_path
if isbytestring(lp): if isbytestring(lp):
lp = force_unicode(lp, filesystem_encoding) lp = force_unicode(lp, filesystem_encoding)
if isinstance(ans, unicode):
ans = ans.encode('utf-8')
ans = ans.replace('{library_name}', xml(os.path.basename(lp))) ans = ans.replace('{library_name}', xml(os.path.basename(lp)))
ans = ans.replace('{library_path}', xml(lp, True)) ans = ans.replace('{library_path}', xml(lp, True))
ans = ans.replace('{initial_search}', initial_search) ans = ans.replace('{initial_search}', initial_search)
@ -335,9 +334,10 @@ class BrowseServer(object):
icon = 'blank.png' icon = 'blank.png'
cats.append((meta['name'], category, icon)) cats.append((meta['name'], category, icon))
cats = [('<li title="{2} {0}"><img src="{3}{src}" alt="{0}" />' cats = [('<li><a title="{2} {0}" href="/browse/category/{1}">&nbsp;</a>'
'<img src="{3}{src}" alt="{0}" />'
'<span class="label">{0}</span>' '<span class="label">{0}</span>'
'<span class="url">{3}/browse/category/{1}</span></li>') '</li>')
.format(xml(x, True), xml(quote(y)), xml(_('Browse books by')), .format(xml(x, True), xml(quote(y)), xml(_('Browse books by')),
self.opts.url_prefix, src='/browse/icon/'+z) self.opts.url_prefix, src='/browse/icon/'+z)
for x, y, z in cats] for x, y, z in cats]
@ -393,14 +393,15 @@ class BrowseServer(object):
for x in sorted(starts): for x in sorted(starts):
category_groups[x] = len([y for y in items if category_groups[x] = len([y for y in items if
getter(y).upper().startswith(x)]) getter(y).upper().startswith(x)])
items = [(u'<h3 title="{0}">{0} <span>[{2}]</span></h3><div>' items = [(u'<h3 title="{0}"><a class="load_href" title="{0}"'
u' href="{4}{3}"><strong>{0}</strong> [{2}]</a></h3><div>'
u'<div class="loaded" style="display:none"></div>' u'<div class="loaded" style="display:none"></div>'
u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" /><em>{1}</em></div>' u'<div class="loading"><img alt="{1}" src="{4}/static/loading.gif" /><em>{1}</em></div>'
u'<span class="load_href">{4}{3}</span></div>').format( u'</div>').format(
xml(s, True), xml(s, True),
xml(_('Loading, please wait'))+'&hellip;', xml(_('Loading, please wait'))+'&hellip;',
unicode(c), unicode(c),
xml(u'/browse/category_group/%s/%s'%(category, s)), xml(u'/browse/category_group/%s/%s'%(category, s), True),
self.opts.url_prefix) self.opts.url_prefix)
for s, c in category_groups.items()] for s, c in category_groups.items()]
items = '\n\n'.join(items) items = '\n\n'.join(items)
@ -563,7 +564,8 @@ class BrowseServer(object):
if not val: if not val:
val = '' val = ''
args[key] = xml(val, True) args[key] = xml(val, True)
fname = ascii_filename(args['title']) + ' - ' + ascii_filename(args['authors']) fname = quote(ascii_filename(args['title']) + ' - ' +
ascii_filename(args['authors']))
return args, fmt, fmts, fname return args, fmt, fmts, fname
@Endpoint(mimetype='application/json; charset=utf-8') @Endpoint(mimetype='application/json; charset=utf-8')

View File

@ -70,10 +70,10 @@ class ContentServer(object):
id = id.rpartition('_')[-1].partition('.')[0] id = id.rpartition('_')[-1].partition('.')[0]
match = re.search(r'\d+', id) match = re.search(r'\d+', id)
if not match: if not match:
raise cherrypy.HTTPError(400, 'id:%s not an integer'%id) raise cherrypy.HTTPError(404, 'id:%s not an integer'%id)
id = int(match.group()) id = int(match.group())
if not self.db.has_id(id): if not self.db.has_id(id):
raise cherrypy.HTTPError(400, 'id:%d does not exist in database'%id) raise cherrypy.HTTPError(404, 'id:%d does not exist in database'%id)
if what == 'thumb' or what.startswith('thumb_'): if what == 'thumb' or what.startswith('thumb_'):
try: try:
width, height = map(int, what.split('_')[1:]) width, height = map(int, what.split('_')[1:])

View File

@ -58,6 +58,9 @@ The OPDS interface is advertised via BonJour automatically.
help=_('Specifies a restriction to be used for this invocation. ' help=_('Specifies a restriction to be used for this invocation. '
'This option overrides any per-library settings specified' 'This option overrides any per-library settings specified'
' in the GUI')) ' in the GUI'))
parser.add_option('--auto-reload', default=False, action='store_true',
help=_('Auto reload server when source code changes. May not'
' work in all environments.'))
return parser return parser

View File

@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import re, os import re, os
import __builtin__ import __builtin__
from urllib import quote
import cherrypy import cherrypy
from lxml import html from lxml import html
@ -115,8 +116,8 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
data = TD() data = TD()
for fmt in book['formats'].split(','): for fmt in book['formats'].split(','):
a = ascii_filename(book['authors']) a = quote(ascii_filename(book['authors']))
t = ascii_filename(book['title']) t = quote(ascii_filename(book['title']))
s = SPAN( s = SPAN(
A( A(
fmt.lower(), fmt.lower(),

View File

@ -0,0 +1,452 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.39"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
# Python 2.2 compatibility: define True/False if the builtins are missing.
if not hasattr(__builtins__, 'True'): True, False = 1, 0

import re, sys, urllib, htmlentitydefs, codecs
import sgmllib
import urlparse

# Patch sgmllib so hexadecimal character references (&#x41;) are
# recognised in addition to decimal ones.
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

# textwrap only exists from Python 2.3 on; optwrap() asserts on `wrap`.
try: from textwrap import wrap
except: pass

# Use Unicode characters instead of their ascii psuedo-replacements
UNICODE_SNOB = 1

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 0

# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
### Entity Nonsense ###
def name2cp(k):
    """Map an HTML entity name to its Unicode code point."""
    # 'apos' is valid XML but missing from the HTML entity tables.
    if k == 'apos':
        return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    # Pre-2.3 fallback: entitydefs maps to either '&#NNN;' or a latin-1 byte.
    replacement = htmlentitydefs.entitydefs[k]
    if replacement.startswith("&#") and replacement.endswith(";"):
        return int(replacement[2:-1])  # not in latin-1
    return ord(codecs.latin_1_decode(replacement)[0])
# ASCII pseudo-replacements for common non-ASCII entities, consulted by
# charref()/entityref() when UNICODE_SNOB is off.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same table keyed by code point, for numeric character references.
unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
    """Resolve the body of a numeric character reference to output text.

    *name* is the part after '&#' (decimal, or 'x'/'X'-prefixed hex).
    """
    if name[0] in ('x', 'X'):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)
    # Prefer the ASCII pseudo-replacement unless Unicode output is wanted.
    if not UNICODE_SNOB and codepoint in unifiable_n.keys():
        return unifiable_n[codepoint]
    return unichr(codepoint)
def entityref(c):
    """Resolve a named entity reference to output text.

    Unknown names are passed through as a literal '&name'.
    """
    # Prefer the ASCII pseudo-replacement unless Unicode output is wanted.
    if not UNICODE_SNOB and c in unifiable.keys():
        return unifiable[c]
    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)
def replaceEntities(s):
    """re.sub callback: expand one matched entity or character reference."""
    body = s.group(1)
    # '#' marks a numeric reference; anything else is a named entity.
    if body[0] == "#":
        return charref(body[1:])
    return entityref(body)
# Matches '&...;' entity and character references (named or numeric).
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(text):
    """Expand every HTML entity/character reference in *text*."""
    return r_unescape.sub(replaceEntities, text)
def fixattrs(attrs):
    """Work around an sgmllib bug: entity references inside attribute
    values are not expanded, so unescape every value ourselves.

    *attrs* is a list of (name, value) pairs, or a falsy placeholder
    (None/empty) which is returned unchanged.
    """
    if not attrs:
        return attrs
    return [(name, unescape(value)) for name, value in attrs]
### End Entity Nonsense ###
def onlywhite(line):
    """Return true if the line does only consist of whitespace characters.

    Returns False at the first character that is not a space or tab;
    otherwise returns the line itself (truthy when non-empty, and the
    empty string -- falsy -- for an empty line).
    """
    for c in line:
        # Fix: the original tested `c is not ' '` twice (identity compare,
        # with the tab operand lost); use equality and include '\t'.
        if c != ' ' and c != '\t':
            return False
    return line
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    No-op when BODY_WIDTH is 0; otherwise each non-indented paragraph is
    re-wrapped at BODY_WIDTH columns and blank lines are limited to one
    consecutive run of two newlines.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    result = ''
    pending_newlines = 0
    for para in text.split("\n"):
        if not para:
            # Blank input line: emit at most two consecutive newlines.
            if pending_newlines < 2:
                result += "\n"
                pending_newlines += 1
            continue
        if para[0] not in (' ', '-', '*'):
            # Ordinary paragraph: re-wrap it and close with a blank line.
            for line in wrap(para, BODY_WIDTH):
                result += line + "\n"
            result += "\n"
            pending_newlines = 2
        elif not onlywhite(para):
            # Indented / list / rule text: pass through unwrapped.
            result += para + "\n"
            pending_newlines = 1
    return result
def hn(tag):
    """Heading level for 'h1'..'h9' tags; 0 for a malformed digit
    ('hx'), None for any other tag."""
    if len(tag) == 2 and tag[0] == 'h':
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
class _html2text(sgmllib.SGMLParser):
    """SGML parser that converts an HTML event stream into Markdown text.

    Output is pushed through ``self.out`` (a callable) as it is produced;
    by default it accumulates into ``self.outtext``, which ``close()``
    returns.  Links and images are collected in ``self.a`` and flushed as
    Markdown reference definitions at the end of the document (or after
    each paragraph when LINKS_EACH_PARAGRAPH is set).
    """

    def __init__(self, out=None, baseurl=''):
        sgmllib.SGMLParser.__init__(self)

        # Route output to the internal accumulator or a caller-supplied sink.
        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtext = u''       # accumulated output (when using outtextf)
        self.quiet = 0           # >0 while inside <head>/<style>/<script>
        self.p_p = 0             # newlines pending before the next output
        self.outcount = 0        # number of o() emissions, times link dumps
        self.start = 1           # true until the first visible output
        self.space = 0           # a collapsed space is pending
        self.a = []              # links awaiting reference definitions
        self.astack = []         # open <a> tags (None for skipped anchors)
        self.acount = 0          # running Markdown reference number
        self.list = []           # stack of open lists: {'name': tag, 'num': n}
        self.blockquote = 0      # blockquote nesting depth
        self.pre = 0             # true while inside <pre>
        self.startpre = 0        # true just after a <pre> opens
        self.lastWasNL = 0       # last emitted character was a newline
        self.abbr_title = None   # current abbreviation definition
        self.abbr_data = None    # last inner HTML (for abbr being defined)
        self.abbr_list = {}      # stack of abbreviations to write later
        self.baseurl = baseurl

    def outtextf(self, s):
        # Default sink: accumulate everything in memory.
        self.outtext += s

    def close(self):
        """Finish parsing and return the complete converted text."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        # force='end' flushes pending link references and abbreviations.
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        # Numeric character reference, e.g. &#65; / &#x41;.
        self.o(charref(c))

    def handle_entityref(self, c):
        # Named entity reference, e.g. &amp;.
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if not attrs.has_key('href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            # Same href; if either side has a title, both titles must match.
            if a.has_key('href') and a['href'] == attrs['href']:
                if a.has_key('title') or attrs.has_key('title'):
                    if (a.has_key('title') and attrs.has_key('title') and
                        a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i

    def handle_tag(self, tag, attrs, start):
        """Translate one start/end tag event into Markdown punctuation."""
        attrs = fixattrs(attrs)

        if hn(tag):
            # Headings become a row of '#' markers.
            self.p()
            if start: self.o(hn(tag)*"#" + ' ')

        if tag in ['p', 'div']: self.p()

        if tag == "br" and start: self.o(" \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            # Suppress all character data inside these elements.
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag in ["body"]:
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']: self.o("_")
        if tag in ['strong', 'b']: self.o("**")
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD

                self.abbr_title = None
                self.abbr_data = ''
                if attrs.has_key('title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    # Remember the definition; emitted at document end.
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # Anchor without a usable href: record it as skipped.
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        # Reuse an existing reference number when possible.
                        i = self.previousIndex(a)
                        if i is not None:
                            a = self.a[i]
                        else:
                            self.acount += 1
                            a['count'] = self.acount
                            a['outcount'] = self.outcount
                            self.a.append(a)
                        self.o("][" + `a['count']` + "]")

        if tag == "img" and start:
            attrsD = {}
            for (x, y) in attrs: attrsD[x] = y
            attrs = attrsD
            if attrs.has_key('src'):
                # Images are emitted as reference-style links too.
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("]["+`attrs['count']`+"]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o(' ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name':tag, 'num':0})
            else:
                if self.list: self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(`li['num']`+". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start: self.p()

        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        # Request at most a single line break before the next output.
        if self.p_p == 0: self.p_p = 1

    def p(self): self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Emit *data*, applying pending breaks, blockquote prefixes and —
        at paragraph breaks or with force='end' — collected references."""
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                # Collapse whitespace runs outside of <pre>.
                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            # Prefix for the current blockquote nesting level.
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += " "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out(('\n'+bq)*self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            # Flush reference definitions for links already emitted.
            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if link.has_key('title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        # NOTE(review): presumably balances quiet for scripts whose close
        # tag appears escaped inside a string literal — confirm.
        if r'\/script>' in data: self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data): pass
def wrapwrite(text):
    """Encode *text* as UTF-8 and write it to standard output."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
def html2text_file(html, out=wrapwrite, baseurl=''):
    """Parse *html* and emit the converted text through *out*;
    return whatever the parser accumulated (see _html2text.close)."""
    parser = _html2text(out, baseurl)
    parser.feed(html)
    parser.feed("")
    return parser.close()
def html2text(html, baseurl=''):
    """Convert an HTML document to Markdown-structured text, wrapped
    at BODY_WIDTH columns when wrapping is enabled."""
    markdown = html2text_file(html, None, baseurl)
    return optwrap(markdown)
# Command-line interface: convert a URL, a local file, or stdin to text.
if __name__ == "__main__":
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            # Fetch the document; use feedparser's encoding detector when
            # available, otherwise assume UTF-8.
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                from feedparser import _getCharacterEncoding as enc
                # NOTE(review): bare reference — presumably marks the
                # imported name as used; confirm.
                enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            # Local file; an optional second argument overrides the encoding.
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            data = open(arg, 'r').read().decode(encoding)
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))