Option for overriding encoding detection

Kovid Goyal 2008-02-28 04:24:17 +00:00
parent ec79457d45
commit 38a813953f
14 changed files with 221 additions and 59 deletions

View File

@@ -229,6 +229,9 @@ def option_parser(usage, gui_mode=False):
                       help='Convert to LRS', default=False)
     parser.add_option('--minimize-memory-usage', action='store_true', default=False,
                       help=_('Minimize memory usage at the cost of longer processing times. Use this option if you are on a memory constrained machine.'))
+    parser.add_option('--encoding', default=None,
+                      help='Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from Windows computers is cp1252. Another common choice is utf-8. The default is to try and guess the encoding.')
     return parser
 
 def find_custom_fonts(options, logger):

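A minimal, self-contained sketch (the parser and file name below are invented, not part of this commit) of the pattern the new flag follows: defaulting --encoding to None lets later code tell "the user chose an encoding" apart from "fall back to autodetection".

from optparse import OptionParser

parser = OptionParser(usage='%prog [options] file.html')
parser.add_option('--encoding', default=None,
                  help='Character encoding of the source file. Leave unset to autodetect.')
options, args = parser.parse_args(['--encoding', 'cp1252', 'book.html'])
if options.encoding is None:
    print('No encoding given, the converter will try to guess one')
else:
    print('Decoding the source as %s' % options.encoding)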
View File

@@ -359,6 +359,8 @@ class HTMLConverter(object):
         raw = f.read()
         if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files
             raw = raw.decode('utf-8', 'ignore')
+        elif self.encoding is not None:
+            raw = raw.decode(self.encoding, 'ignore')
         else:
             raw = xml_to_unicode(raw, self.verbose)[0]
         f.close()

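The decode order the hunk above implements, sketched as a stand-alone helper. xml_to_unicode is internal to libprs500, so the guessing branch here is a simplified stand-in, not the real function:

def to_unicode(raw, encoding=None):
    # An explicit, user-supplied encoding always wins; errors are ignored so a
    # slightly wrong choice still yields output instead of an exception.
    if encoding is not None:
        return raw.decode(encoding, 'ignore')
    # Otherwise guess, roughly what xml_to_unicode does with more heuristics:
    # try UTF-8 first, then fall back to a common single-byte encoding.
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('cp1252', 'replace')

print(to_unicode(b'caf\xc3\xa9'))        # valid UTF-8, decoded as such
print(to_unicode(b'caf\xe9', 'cp1252'))  # explicit override for a cp1252 file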
View File

@@ -74,11 +74,10 @@ def process_file(path, options, logger=None):
         logger = logging.getLogger('txt2lrf')
         setup_cli_handlers(logger, level)
     txt = os.path.abspath(os.path.expanduser(path))
-    if not hasattr(options, 'encoding'):
-        options.encoding = None
     if not hasattr(options, 'debug_html_generation'):
         options.debug_html_generation = False
     htmlfile = generate_html(txt, options.encoding, logger)
+    options.encoding = 'utf-8'
     if not options.debug_html_generation:
         options.force_page_break = 'h2'
     if not options.output:

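The reason options.encoding is rewritten to 'utf-8' after generate_html: the intermediate HTML is emitted as UTF-8, so the HTML-to-LRF stage must not reuse the original source encoding. A rough sketch of that handoff (the helper below paraphrases the idea; it is not the real generate_html):

import codecs

def txt_to_intermediate_html(txt_path, encoding, html_path):
    # Decode the text file with the user's encoding (or assume UTF-8)...
    raw = open(txt_path, 'rb').read()
    text = raw.decode(encoding or 'utf-8', 'replace')
    # ...but always write the intermediate HTML as UTF-8.
    codecs.open(html_path, 'w', 'utf-8').write(
        u'<html><body><pre>%s</pre></body></html>' % text)

# From this point on the pipeline should treat the file as UTF-8, which is
# exactly what the added "options.encoding = 'utf-8'" line declares.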
View File

@@ -12,7 +12,7 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''Convert known websites into LRF files.'''
+'''Convert websites into LRF files.'''
 
 import sys, time, tempfile, shutil, os, logging, imp, inspect, re
 from urlparse import urlsplit
@@ -54,7 +54,6 @@ def option_parser():
                       help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.timeout)
     parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                       help='Do not download CSS stylesheets.', dest='no_stylesheets')
-
     parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
@@ -64,7 +63,7 @@ def option_parser():
     return parser
 
 def fetch_website(options, logger):
-    tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
+    tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
     options.dir = tdir
     fetcher = create_fetcher(options, logger)
     fetcher.preprocess_regexps = options.preprocess_regexps
@@ -147,10 +146,13 @@ def process_profile(args, options, logger=None):
     options.preprocess_regexps = profile.preprocess_regexps
     options.filter_regexps += profile.filter_regexps
+    options.encoding = profile.encoding if options.encoding is None else options.encoding
     if len(args) == 2 and args[1] != 'default':
         options.anchor_ids = False
     htmlfile, tdir = fetch_website(options, logger)
+    options.encoding = 'utf-8'
     cwd = os.getcwdu()
     if not options.output:
         options.output = os.path.join(cwd, options.title+('.lrs' if options.lrs else '.lrf'))

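The precedence established above, written out as a tiny hypothetical helper: a command-line --encoding beats the profile's encoding attribute, and once fetch_website has re-saved everything as UTF-8, the rest of the pipeline is simply told 'utf-8':

def effective_encoding(cli_encoding, profile_encoding):
    # None means "the user did not ask for anything specific".
    return cli_encoding if cli_encoding is not None else profile_encoding

assert effective_encoding(None, 'cp1252') == 'cp1252'      # profile wins
assert effective_encoding('latin1', 'cp1252') == 'latin1'  # CLI overrides
assert effective_encoding(None, None) is None              # fall back to guessing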
View File

@@ -13,6 +13,8 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''
+Contains the Base Profiles that can be used to easily create profiles to download
+particular websites.
 '''
 
 import tempfile, time, calendar, re, operator, atexit, shutil, os
@@ -24,36 +26,120 @@ from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString,
 
 class DefaultProfile(object):
 
-    url                   = ''       # The URL of the website
-    title                 = 'Default Profile' # The title to use for the LRF file
-    max_articles_per_feed = 10       # Maximum number of articles to download from each feed
-    html_description      = True     # If True process the <description> element of the feed as HTML
-    oldest_article        = 7        # How many days old should the oldest article downloaded from the feeds be?
-    max_recursions        = 1        # Number of levels of links to follow
-    max_files             = 3000     # Maximum number of files to download
-    delay                 = 0        # Delay between consecutive downloads
-    timeout               = 10       # Timeout for fetching files from server in seconds
-    timefmt               = ' [%a %d %b %Y]' # The format of the date shown on the first page
-    url_search_order      = ['guid', 'link'] # The order of elements to search for a URL when parssing the RSS feed
-    pubdate_fmt           = None     # The format string used to parse the publication date in the RSS feed. If set to None some default heuristics are used, these may fail, in which case set this to the correct string or re-implement strptime in your subclass.
-    use_pubdate           = True,    # If True will look for a publication date for each article. If False assumes the publication date is the current time.
-    summary_length        = 500      # Max number of characters in the short description (ignored in DefaultProfile)
-    no_stylesheets        = False    # Download stylesheets only if False
-    allow_duplicates      = False    # If False articles with the same title in the same feed are not downloaded multiple times
-    needs_subscription    = False    # If True the GUI will ask the userfor a username and password to use while downloading
-    match_regexps         = []       # List of regular expressions that determines which links to follow
-    filter_regexps        = []       # List of regular expressions that determines which links to ignore
-    # Only one of match_regexps or filter_regexps should be defined
-    html2lrf_options      = []       # List of options to pass to html2lrf
-    # List of regexp substitution rules to run on the downloaded HTML. Each element of the
-    # list should be a two element tuple. The first element of the tuple should
-    # be a compiled regular expression and the second a callable that takes
-    # a single match object and returns a string to replace the match.
+    #: The title to use for the LRF file
+    #: @type: string
+    title = 'Default Profile'
+
+    #: Maximum number of articles to download from each feed
+    #: @type: integer
+    max_articles_per_feed = 10
+
+    #: If True process the <description> element of the feed as HTML
+    #: @type: boolean
+    html_description = True
+
+    #: How many days old should the oldest article downloaded from the feeds be
+    #: @type: integer
+    oldest_article = 7
+
+    #: Recommended frequency at which to download this profile, in days
+    #: @type: integer
+    recommended_frequency = 7
+
+    #: Number of levels of links to follow
+    #: @type: integer
+    max_recursions = 1
+
+    #: Maximum number of files to download
+    #: @type: integer
+    max_files = 3000
+
+    #: Delay between consecutive downloads in seconds
+    #: @type: integer
+    delay = 0
+
+    #: Timeout for fetching files from server in seconds
+    #: @type: integer
+    timeout = 10
+
+    #: The format string for the date shown on the first page
+    #: @type: string
+    timefmt = ' [%a %d %b %Y]'
+
+    #: The order of elements to search for a URL when parsing the RSS feed. You
+    #: can replace these elements by completely arbitrary elements to customize
+    #: feed processing.
+    #: @type: list of strings
+    url_search_order = ['guid', 'link']
+
+    #: The format string used to parse the publication date in the RSS feed.
+    #: If set to None some default heuristics are used, these may fail,
+    #: in which case set this to the correct string or re-implement
+    #: L{DefaultProfile.strptime} in your subclass.
+    #: @type: string or None
+    pubdate_fmt = None
+
+    #: If True will look for a publication date for each article.
+    #: If False assumes the publication date is the current time.
+    #: @type: boolean
+    use_pubdate = True
+
+    #: Max number of characters in the short description.
+    #: Used by L{FullContentProfile}
+    #: @type: integer
+    summary_length = 500
+
+    #: If True stylesheets are not downloaded and processed.
+    #: Convenient flag to disable loading of stylesheets for websites
+    #: that have overly complex stylesheets unsuitable for conversion
+    #: to ebook formats
+    #: @type: boolean
+    no_stylesheets = False
+
+    #: If False articles with the same title in the same feed
+    #: are not downloaded multiple times
+    #: @type: boolean
+    allow_duplicates = False
+
+    #: If True the GUI will ask the user for a username and password
+    #: to use while downloading
+    #: @type: boolean
+    needs_subscription = False
+
+    #: Specify an override encoding for sites that have an incorrect
+    #: charset specification. The most common case is a site that declares
+    #: latin1 but actually uses cp1252.
+    encoding = None
+
+    #: List of regular expressions that determine which links to follow.
+    #: If empty, it is ignored.
+    #: Only one of L{match_regexps} or L{filter_regexps} should be defined.
+    #: @type: list of strings
+    match_regexps = []
+
+    #: List of regular expressions that determine which links to ignore.
+    #: If empty, it is ignored.
+    #: Only one of L{match_regexps} or L{filter_regexps} should be defined.
+    #: @type: list of strings
+    filter_regexps = []
+
+    #: List of options to pass to html2lrf, to customize conversion
+    #: to LRF
+    #: @type: list of strings
+    html2lrf_options = []
+
+    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
+    #: list should be a two element tuple. The first element of the tuple should
+    #: be a compiled regular expression and the second a callable that takes
+    #: a single match object and returns a string to replace the match.
+    #: @type: list of tuples
     preprocess_regexps = []
 
     # See the built-in profiles for examples of these settings.
+
+    #: The URL of the website
+    #: @type: string
+    url = ''
 
     feeds = []
 
     CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
@@ -84,9 +170,7 @@ class DefaultProfile(object):
         '''
         return browser()
 
-    ########################################################################
-    ###################### End of customizable portion #####################
-    ########################################################################
 
     def __init__(self, logger, verbose=False, username=None, password=None):
@@ -105,14 +189,14 @@ class DefaultProfile(object):
     def build_index(self):
         '''Build an RSS based index.html'''
         articles = self.parse_feeds()
+        encoding = 'utf-8' if self.encoding is None else self.encoding
 
         def build_sub_index(title, items):
             ilist = ''
             li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
             for item in items:
                 if not item.has_key('date'):
-                    item['date'] = time.ctime()
+                    item['date'] = time.strftime('%a, %d %b', time.localtime())
                 ilist += li%item
             return u'''\
             <html>
@@ -135,7 +219,7 @@ class DefaultProfile(object):
             prefix = 'file:' if iswindows else ''
             clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
             src = build_sub_index(category, articles[category])
-            open(cfile, 'wb').write(src.encode('utf-8'))
+            open(cfile, 'wb').write(src.encode(encoding))
 
         src = '''\
         <html>
@@ -150,7 +234,8 @@ class DefaultProfile(object):
         '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
                  categories=clist, title=self.title)
         index = os.path.join(self.temp_dir, 'index.html')
-        open(index, 'wb').write(src.encode('utf-8'))
+        open(index, 'wb').write(src.encode(encoding))
+
         return index
@@ -160,7 +245,9 @@ class DefaultProfile(object):
         Convenience method to take a BeautifulSoup Tag and extract the text from it
         recursively, including any CDATA sections and alt tag attributes.
         @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
+        @type use_alt: boolean
         @return: A unicode (possibly empty) object
+        @rtype: unicode string
         '''
         if not tag:
             return ''
@@ -181,11 +268,13 @@ class DefaultProfile(object):
     def get_article_url(self, item):
         '''
         Return the article URL given an item Tag from a feed, or None if no valid URL is found
-        @param: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
+        @param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
+        @type item: BeautifulSoup.Tag
+        @rtype: string or None
         '''
         url = None
         for element in self.url_search_order:
-            url = item.find(element)
+            url = item.find(element.lower())
             if url:
                 break
         return url
@@ -195,14 +284,16 @@ class DefaultProfile(object):
         '''
         Create list of articles from a list of feeds.
         @param require_url: If True skip articles that don't have a link to a HTML page with the full article contents.
+        @type require_url: boolean
+        @rtype: dictionary
         @return: A dictionary whose keys are feed titles and whose values are each
-        a list of dictionaries. Each list contains dictionaries of the form:
+        a list of dictionaries. Each list contains dictionaries of the form::
             {
             'title'       : article title,
             'url'         : URL of print version,
             'date'        : The publication date of the article as a string,
             'description' : A summary of the article
-            'content'     : The full article (can be an empty string). This is unused in DefaultProfile
+            'content'     : The full article (can be an empty string). This is used by FullContentProfile
             }
         '''
         added_articles = {}
@@ -299,6 +390,12 @@ class DefaultProfile(object):
     @classmethod
     def process_html_description(cls, tag, strip_links=True):
+        '''
+        Process a <description> tag that contains HTML markup, either
+        entity encoded or escaped in a CDATA section.
+        @return: HTML
+        @rtype: string
+        '''
         src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
         match = cls.CDATA_PAT.match(src.lstrip())
         if match:
@@ -325,7 +422,13 @@ class DefaultProfile(object):
     def strptime(cls, src):
         '''
         Take a string and return the date that string represents, in UTC as
-        an epoch (i.e. number of seconds since Jan 1, 1970)
+        an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
+        a bunch of heuristics and is a prime candidate for being overridden in a
+        subclass.
+        @param src: Timestamp as a string
+        @type src: string
+        @return: time as an epoch
+        @rtype: number
         '''
         delta = 0
         zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
@@ -376,7 +479,7 @@ class FullContentProfile(DefaultProfile):
 
     def build_index(self):
-        '''Build an RSS based index.html'''
+        '''Build an RSS based index.html.'''
         articles = self.parse_feeds(require_url=False)
 
         def build_sub_index(title, items):
@@ -468,3 +571,4 @@ def cleanup(tdir):
         shutil.rmtree(tdir)
     except:
         pass

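To show how the new encoding attribute is meant to be used, here is a minimal, made-up profile; the site, feed URL and values are invented, and only attributes relevant to this commit are overridden:

# Adjust the import to wherever DefaultProfile lives in your checkout.
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class ExampleNewsProfile(DefaultProfile):

    title = 'Example News'
    url = 'http://news.example.com'
    #: The site declares latin1 in its pages but actually serves cp1252,
    #: so skip detection and decode everything as cp1252.
    encoding = 'cp1252'
    #: Check for new articles once a day.
    recommended_frequency = 1
    feeds = [('Front page', 'http://news.example.com/rss.xml')]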
View File

@@ -34,6 +34,7 @@ class DeviceDetector(QThread):
         self.devices = [[d, False] for d in devices()]
         self.sleep_time = sleep_time
         QThread.__init__(self)
+        self.keep_going = True
 
     def run(self):
         _wmi = None
@@ -42,7 +43,7 @@ class DeviceDetector(QThread):
             pythoncom.CoInitialize()
             _wmi = wmi.WMI()
         scanner = DeviceScanner(_wmi)
-        while True:
+        while self.keep_going:
             scanner.scan()
             for device in self.devices:
                 connected = scanner.is_device_connected(device[0])

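The stop flag added to DeviceDetector is the standard cooperative-shutdown pattern for a polling thread. Sketched here with threading.Thread so it runs without PyQt4; the names mirror the change above but the class itself is illustrative:

import threading, time

class Detector(threading.Thread):

    def __init__(self, sleep_time=0.1):
        threading.Thread.__init__(self)
        self.sleep_time = sleep_time
        self.keep_going = True      # checked on every pass through the loop

    def run(self):
        while self.keep_going:
            # scan for connected devices here
            time.sleep(self.sleep_time)

d = Detector()
d.start()
d.keep_going = False   # ask the thread to stop...
d.join(2)              # ...and wait up to 2 seconds, as the GUI now does on close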
View File

@@ -12,7 +12,7 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import os, cPickle
+import os, cPickle, codecs
 
 from PyQt4.QtCore import QObject, SIGNAL, Qt, QSettings, QVariant, QByteArray
 from PyQt4.QtGui import QAbstractSpinBox, QLineEdit, QCheckBox, QDialog, \
@@ -315,6 +315,14 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
             elif isinstance(obj, QLineEdit):
                 val = qstring_to_unicode(obj.text())
                 if val:
+                    if opt == '--encoding':
+                        try:
+                            codecs.getdecoder(val)
+                        except:
+                            d = error_dialog(self, 'Unknown encoding',
+                                    '<p>Unknown encoding: %s<br/>For a list of known encodings see http://docs.python.org/lib/standard-encodings.html'%val)
+                            d.exec_()
+                            return
                     cmd.extend([opt, val])
             elif isinstance(obj, QTextEdit):
                 val = qstring_to_unicode(obj.toPlainText())
@@ -366,6 +374,8 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
     def accept(self):
         cmdline = self.build_commandline()
+        if cmdline is None:
+            return
         if self.db:
             self.cover_file = None
             self.write_metadata()

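The validation added to the dialog leans on the codec registry: codecs.getdecoder() raises LookupError for names Python does not know, which is what the bare except above catches before the command line is built. A stand-alone sketch:

import codecs

def is_known_encoding(name):
    try:
        codecs.getdecoder(name)
        return True
    except LookupError:
        return False

print(is_known_encoding('cp1252'))        # True
print(is_known_encoding('windows-1252'))  # True, a registered alias
print(is_known_encoding('cp-1252'))       # False, not a registered name or alias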
View File

@@ -559,6 +559,19 @@
        <item row="2" column="1" colspan="2" >
         <widget class="QComboBox" name="gui_mono_family" />
        </item>
+       <item row="3" column="0" >
+        <widget class="QLabel" name="label_26" >
+         <property name="text" >
+          <string>Source en&amp;coding:</string>
+         </property>
+         <property name="buddy" >
+          <cstring>gui_encoding</cstring>
+         </property>
+        </widget>
+       </item>
+       <item row="3" column="1" colspan="2" >
+        <widget class="QLineEdit" name="gui_encoding" />
+       </item>
       </layout>
      </widget>
     </item>

View File

@@ -984,6 +984,10 @@ class Main(MainWindow, Ui_MainWindow):
                 e.ignore()
                 return
             self.write_settings()
+            self.detector.keep_going = False
+            self.hide()
+            self.detector.wait(2000)
+            self.detector.terminate()
             e.accept()
 
     def update_found(self, version):

View File

@@ -1,7 +1,7 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
-    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
+<!DOCTYPE html PUBLIC
+    "-//W3C//DTD XHTML 1.0 Frameset//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
     <meta name="author" content="Kovid Goyal" />

View File

@@ -80,11 +80,11 @@ def clean():
     return 0
 
 def compile_help():
-    QTDIR = '/usr/local/Trolltech/Qt-4.4.0-tp1'
+    QTDIR = '/usr/local/Trolltech/Qt-4.4.0-beta1'
     QTBIN = QTDIR + '/bin'
     QTLIB = QTDIR + '/lib'
     QCG = os.path.join(QTBIN, 'qcollectiongenerator')
-    QTA = os.path.join(QTBIN, 'assistant_new')
+    QTA = os.path.join(QTBIN, 'assistant')
     os.environ['LD_LIBRARY_PATH'] = QTLIB
     subprocess.check_call((QCG, 'libprs500.qhcp'))
     subprocess.call((QTA, '-collectionFile', 'libprs500.qhc'))
@@ -156,7 +156,7 @@ def generate_cli_docs(src='libprs500.qhp'):
         '<li><a href="cli-%s.html">%s</a></li>\n'%(i[0], i[0]) for i in documented_cmds)
     body = '<h1 class="documentHeading">The Command Line Interface</h1>\n'
-    body += '<div style="text-align:center"><img src="images/cli.png" /></div>'
+    body += '<div style="text-align:center"><img src="images/cli.png" alt="CLI" /></div>'
     body += '<p>%s</p>\n'%'<b class="cmd">libprs500</b> has a very comprehensive command line interface to perform most operations that can be performed by the GUI.'
     body += '<h2 class="sectionHeading">Documented commands</h2>\n'+dc_html
     body += '<h2 class="sectionHeading">Undocumented commands</h2>\n'+uc_html

View File

@@ -10,6 +10,8 @@
 #browser {
     font-family: monospace;
 }
+a { color: black; }
+a:visited { color: black; }
 .toplevel {
     font-weight: bold;
 }

View File

@@ -13,7 +13,8 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''
-Fetch a webpage and its links recursively.
+Fetch a webpage and its links recursively. The webpages are saved to disk in
+UTF-8 encoding with any charset declarations removed.
 '''
 import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2
 from urllib import url2pathname
@@ -35,6 +36,9 @@ def basename(url):
     return res
 
 def save_soup(soup, target):
+    for meta in soup.findAll('meta', content=True):
+        if 'charset' in meta['content']:
+            meta.extract()
     f = codecs.open(target, 'w', 'utf8')
     f.write(unicode(soup))
     f.close()
@@ -58,6 +62,7 @@ class RecursiveFetcher(object):
         self.default_timeout = socket.getdefaulttimeout()
         socket.setdefaulttimeout(options.timeout)
         self.verbose = options.verbose
+        self.encoding = options.encoding
         self.browser = options.browser if hasattr(options, 'browser') else browser()
         self.max_recursions = options.max_recursions
         self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
@@ -262,6 +267,11 @@ class RecursiveFetcher(object):
             dsrc = f.read()
             if len(dsrc) == 0:
                 raise Exception('No content')
+            if self.encoding is not None:
+                dsrc = dsrc.decode(self.encoding, 'ignore')
+            else:
+                dsrc = xml_to_unicode(dsrc)
             soup = self.get_soup(dsrc)
             self.logger.debug('Processing images...')
             self.process_images(soup, f.geturl())
@@ -305,6 +315,8 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %default')
     parser.add_option('--delay', default=0, dest='delay', type='int',
                       help='Minimum interval in seconds between consecutive fetches. Default is %default s')
+    parser.add_option('--encoding', default=None,
+                      help='The character encoding for the websites you are trying to download. The default is to try and guess the encoding.')
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',

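The save_soup change is easiest to see in isolation: fetched pages are re-encoded to UTF-8 on disk, so any surviving <meta ... charset=...> declaration would now be wrong and is dropped. A small sketch using the standalone bs4 package (the commit itself uses the BeautifulSoup copy bundled with libprs500):

from bs4 import BeautifulSoup

html = ('<html><head>'
        '<meta http-equiv="Content-Type" content="text/html; charset=latin1" />'
        '</head><body>Hello</body></html>')
soup = BeautifulSoup(html, 'html.parser')

# Same test as in the hunk above: any <meta> whose content mentions a charset
# is removed, because the file is about to be rewritten as UTF-8.
for meta in soup.findAll('meta', content=True):
    if 'charset' in meta['content']:
        meta.extract()

open('page.html', 'wb').write(str(soup).encode('utf-8'))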
View File

@@ -10,6 +10,7 @@ import pysvn
 PREFIX = "/var/www/vhosts/kovidgoyal.net/subdomains/libprs500"
 DOWNLOADS = PREFIX+"/httpdocs/downloads"
 DOCS = PREFIX+"/httpdocs/apidocs"
+USER_MANUAL = PREFIX+'/httpdocs/user_manual'
 HTML2LRF = "src/libprs500/ebooks/lrf/html/demo"
 TXT2LRF = "src/libprs500/ebooks/lrf/txt/demo"
 check_call = partial(_check_call, shell=True)
@@ -108,7 +109,15 @@ def upload_docs():
     check_call('''epydoc -v --config epydoc-pdf.conf''')
     check_call('''scp docs/pdf/api.pdf castalia:%s/'''%(DOCS,))
 
+def upload_user_manual():
+    cwd = os.getcwdu()
+    os.chdir('src/libprs500/manual')
+    try:
+        check_call('python make.py')
+        check_call('ssh castalia rm -rf %s/\\*'%USER_MANUAL)
+        check_call('scp -r *.html styles images castalia:%s/'%USER_MANUAL)
+    finally:
+        os.chdir(cwd)
 
 def main():
     upload = len(sys.argv) < 2
@@ -128,6 +137,7 @@ def main():
     print 'Uploading to PyPI'
     check_call('''python setup.py register bdist_egg --exclude-source-files upload''')
     upload_docs()
+    upload_user_manual()
     check_call('''rm -rf dist/* build/*''')
 
 if __name__ == '__main__':