diff --git a/src/libprs500/cli/__init__.py b/src/libprs500/cli/__init__.py deleted file mode 100644 index b20d1ac2d5..0000000000 --- a/src/libprs500/cli/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2 of the License, or -## (at your option) any later version. -## -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -Provides a command-line interface to the SONY Reader PRS-500. - -For usage information run the script. -""" -__docformat__ = "epytext" -__author__ = "Kovid Goyal " diff --git a/src/libprs500/cli/main.py b/src/libprs500/cli/main.py deleted file mode 100755 index 10ef8e4ccd..0000000000 --- a/src/libprs500/cli/main.py +++ /dev/null @@ -1,325 +0,0 @@ -## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2 of the License, or -## (at your option) any later version. -## -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -Provides a command-line and optional graphical interface to the SONY Reader PRS-500. - -For usage information run the script. -""" - -import StringIO, sys, time, os -from optparse import OptionParser - -from libprs500 import __version__ as VERSION -from libprs500.prs500 import PRS500 -from libprs500.cli.terminfo import TerminalController -from libprs500.errors import ArgumentError, DeviceError, DeviceLocked - - -MINIMUM_COL_WIDTH = 12 #: Minimum width of columns in ls output - -def human_readable(size): - """ Convert a size in bytes into a human readle form """ - if size < 1024: divisor, suffix = 1, "" - elif size < 1024*1024: divisor, suffix = 1024., "K" - elif size < 1024*1024*1024: divisor, suffix = 1024*1024, "M" - elif size < 1024*1024*1024*1024: divisor, suffix = 1024*1024, "G" - size = str(size/divisor) - if size.find(".") > -1: size = size[:size.find(".")+2] - return size + suffix - -class FileFormatter(object): - def __init__(self, file, term): - self.term = term - self.is_dir = file.is_dir - self.is_readonly = file.is_readonly - self.size = file.size - self.ctime = file.ctime - self.wtime = file.wtime - self.name = file.name - self.path = file.path - - @apply - def mode_string(): - doc=""" The mode string for this file. There are only two modes read-only and read-write """ - def fget(self): - mode, x = "-", "-" - if self.is_dir: mode, x = "d", "x" - if self.is_readonly: mode += "r-"+x+"r-"+x+"r-"+x - else: mode += "rw"+x+"rw"+x+"rw"+x - return mode - return property(doc=doc, fget=fget) - - @apply - def isdir_name(): - doc='''Return self.name + '/' if self is a directory''' - def fget(self): - name = self.name - if self.is_dir: - name += '/' - return name - return property(doc=doc, fget=fget) - - - @apply - def name_in_color(): - doc=""" The name in ANSI text. Directories are blue, ebooks are green """ - def fget(self): - cname = self.name - blue, green, normal = "", "", "" - if self.term: blue, green, normal = self.term.BLUE, self.term.GREEN, self.term.NORMAL - if self.is_dir: cname = blue + self.name + normal - else: - ext = self.name[self.name.rfind("."):] - if ext in (".pdf", ".rtf", ".lrf", ".lrx", ".txt"): cname = green + self.name + normal - return cname - return property(doc=doc, fget=fget) - - @apply - def human_readable_size(): - doc=""" File size in human readable form """ - def fget(self): - return human_readable(self.size) - return property(doc=doc, fget=fget) - - @apply - def modification_time(): - doc=""" Last modified time in the Linux ls -l format """ - def fget(self): - return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime)) - return property(doc=doc, fget=fget) - - @apply - def creation_time(): - doc=""" Last modified time in the Linux ls -l format """ - def fget(self): - return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime)) - return property(doc=doc, fget=fget) - -def info(dev): - info = dev.get_device_information() - print "Device name: ", info[0] - print "Device version: ", info[1] - print "Software version:", info[2] - print "Mime type: ", info[3] - -def ls(dev, path, term, recurse=False, color=False, human_readable_size=False, ll=False, cols=0): - def col_split(l, cols): # split list l into columns - rows = len(l) / cols - if len(l) % cols: - rows += 1 - m = [] - for i in range(rows): - m.append(l[i::rows]) - return m - - def row_widths(table): # Calculate widths for each column in the row-wise table - tcols = len(table[0]) - rowwidths = [ 0 for i in range(tcols) ] - for row in table: - c = 0 - for item in row: - rowwidths[c] = len(item) if len(item) > rowwidths[c] else rowwidths[c] - c += 1 - return rowwidths - - output = StringIO.StringIO() - if path.endswith("/"): path = path[:-1] - dirs = dev.list(path, recurse) - for dir in dirs: - if recurse: print >>output, dir[0] + ":" - lsoutput, lscoloutput = [], [] - files = dir[1] - maxlen = 0 - if ll: # Calculate column width for size column - for file in files: - size = len(str(file.size)) - if human_readable_size: - file = FileFormatter(file, term) - size = len(file.human_readable_size) - if size > maxlen: maxlen = size - for file in files: - file = FileFormatter(file, term) - name = file.name if ll else file.isdir_name - lsoutput.append(name) - if color: name = file.name_in_color - lscoloutput.append(name) - if ll: - size = str(file.size) - if human_readable_size: size = file.human_readable_size - print >>output, file.mode_string, ("%"+str(maxlen)+"s")%size, file.modification_time, name - if not ll and len(lsoutput) > 0: - trytable = [] - for colwidth in range(MINIMUM_COL_WIDTH, cols): - trycols = int(cols/colwidth) - trytable = col_split(lsoutput, trycols) - works = True - for row in trytable: - row_break = False - for item in row: - if len(item) > colwidth - 1: - works, row_break = False, True - break - if row_break: break - if works: break - rowwidths = row_widths(trytable) - trytablecol = col_split(lscoloutput, len(trytable[0])) - for r in range(len(trytable)): - for c in range(len(trytable[r])): - padding = rowwidths[c] - len(trytable[r][c]) - print >>output, trytablecol[r][c], "".ljust(padding), - print >>output - print >>output - listing = output.getvalue().rstrip()+ "\n" - output.close() - return listing - -def main(): - term = TerminalController() - cols = term.COLS - if not cols: # On windows terminal width is unknown - cols = 80 - - parser = OptionParser(usage="usage: %prog [options] command args\n\ncommand is one of: info, books, df, ls, cp, mkdir, touch, cat, rm\n\n"+ - "For help on a particular command: %prog command", version="libprs500 version: " + VERSION) - parser.add_option("--log-packets", help="print out packet stream to stdout. "+\ - "The numbers in the left column are byte offsets that allow the packet size to be read off easily.", - dest="log_packets", action="store_true", default=False) - parser.add_option("--unlock", help="Unlock device with KEY. For e.g. --unlock=1234", \ - dest='key', default='-1') - parser.remove_option("-h") - parser.disable_interspersed_args() # Allow unrecognized options - options, args = parser.parse_args() - - if len(args) < 1: - parser.print_help() - return 1 - - command = args[0] - args = args[1:] - dev = PRS500(key=options.key, log_packets=options.log_packets) - try: - if command == "df": - total = dev.total_space(end_session=False) - free = dev.free_space() - where = ("Memory", "Stick", "Card") - print "Filesystem\tSize \tUsed \tAvail \tUse%" - for i in range(3): - print "%-10s\t%s\t%s\t%s\t%s"%(where[i], human_readable(total[i]), human_readable(total[i]-free[i]), human_readable(free[i]),\ - str(0 if total[i]==0 else int(100*(total[i]-free[i])/(total[i]*1.)))+"%") - elif command == "books": - print "Books in main memory:" - for book in dev.books(): - print book - print "\nBooks on storage card:" - for book in dev.books(oncard=True): print book - elif command == "mkdir": - parser = OptionParser(usage="usage: %prog mkdir [options] path\nCreate a directory on the device\n\npath must begin with /,a:/ or b:/") - if len(args) != 1: - parser.print_help() - sys.exit(1) - dev.mkdir(args[0]) - elif command == "ls": - parser = OptionParser(usage="usage: %prog ls [options] path\nList files on the device\n\npath must begin with /,a:/ or b:/") - parser.add_option("--color", help="show ls output in color", dest="color", action="store_true", default=False) - parser.add_option("-l", help="In addition to the name of each file, print the file type, permissions, and timestamp (the modification time, in the local timezone). Times are local.", dest="ll", action="store_true", default=False) - parser.add_option("-R", help="Recursively list subdirectories encountered. /dev and /proc are omitted", dest="recurse", action="store_true", default=False) - parser.remove_option("-h") - parser.add_option("-h", "--human-readable", help="show sizes in human readable format", dest="hrs", action="store_true", default=False) - options, args = parser.parse_args(args) - if len(args) != 1: - parser.print_help() - return 1 - print ls(dev, args[0], term, color=options.color, recurse=options.recurse, ll=options.ll, human_readable_size=options.hrs, cols=cols), - elif command == "info": - info(dev) - elif command == "cp": - usage="usage: %prog cp [options] source destination\nCopy files to/from the device\n\n"+\ - "One of source or destination must be a path on the device. \n\nDevice paths have the form\n"+\ - "prs500:mountpoint/my/path\n"+\ - "where mountpoint is one of /, a: or b:\n\n"+\ - "source must point to a file for which you have read permissions\n"+\ - "destination must point to a file or directory for which you have write permissions" - parser = OptionParser(usage=usage) - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - if args[0].startswith("prs500:"): - outfile = args[1] - path = args[0][7:] - if path.endswith("/"): path = path[:-1] - if os.path.isdir(outfile): - outfile = os.path.join(outfile, path[path.rfind("/")+1:]) - try: - outfile = open(outfile, "wb") - except IOError, e: - print >> sys.stderr, e - parser.print_help() - return 1 - dev.get_file(path, outfile) - outfile.close() - elif args[1].startswith("prs500:"): - try: - infile = open(args[0], "rb") - except IOError, e: - print >> sys.stderr, e - parser.print_help() - return 1 - dev.put_file(infile, args[1][7:]) - infile.close() - else: - parser.print_help() - return 1 - elif command == "cat": - outfile = sys.stdout - parser = OptionParser(usage="usage: %prog cat path\nShow file on the device\n\npath should point to a file on the device and must begin with /,a:/ or b:/") - options, args = parser.parse_args(args) - if len(args) != 1: - parser.print_help() - return 1 - if args[0].endswith("/"): path = args[0][:-1] - else: path = args[0] - outfile = sys.stdout - dev.get_file(path, outfile) - elif command == "rm": - parser = OptionParser(usage="usage: %prog rm path\nDelete files from the device\n\npath should point to a file or empty directory on the device "+\ - "and must begin with /,a:/ or b:/\n\n"+\ - "rm will DELETE the file. Be very CAREFUL") - options, args = parser.parse_args(args) - if len(args) != 1: - parser.print_help() - return 1 - dev.rm(args[0]) - elif command == "touch": - parser = OptionParser(usage="usage: %prog touch path\nCreate an empty file on the device\n\npath should point to a file on the device and must begin with /,a:/ or b:/\n\n"+ - "Unfortunately, I cant figure out how to update file times on the device, so if path already exists, touch does nothing" ) - options, args = parser.parse_args(args) - if len(args) != 1: - parser.print_help() - return 1 - dev.touch(args[0]) - else: - parser.print_help() - if dev.handle: dev.close() - return 1 - except DeviceLocked: - print >> sys.stderr, "The device is locked. Use the --unlock option" - except (ArgumentError, DeviceError), e: - print >>sys.stderr, e - return 1 - return 0 - -if __name__ == '__main__': - main() diff --git a/src/libprs500/cli/terminfo.py b/src/libprs500/cli/terminfo.py deleted file mode 100644 index 385994db6e..0000000000 --- a/src/libprs500/cli/terminfo.py +++ /dev/null @@ -1,208 +0,0 @@ -## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2 of the License, or -## (at your option) any later version. -## -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -import sys, re - -""" Get information about the terminal we are running in """ - -class TerminalController: - """ - A class that can be used to portably generate formatted output to - a terminal. - - `TerminalController` defines a set of instance variables whose - values are initialized to the control sequence necessary to - perform a given action. These can be simply included in normal - output to the terminal: - - >>> term = TerminalController() - >>> print 'This is '+term.GREEN+'green'+term.NORMAL - - Alternatively, the `render()` method can used, which replaces - '${action}' with the string required to perform 'action': - - >>> term = TerminalController() - >>> print term.render('This is ${GREEN}green${NORMAL}') - - If the terminal doesn't support a given action, then the value of - the corresponding instance variable will be set to ''. As a - result, the above code will still work on terminals that do not - support color, except that their output will not be colored. - Also, this means that you can test whether the terminal supports a - given action by simply testing the truth value of the - corresponding instance variable: - - >>> term = TerminalController() - >>> if term.CLEAR_SCREEN: - ... print 'This terminal supports clearning the screen.' - - Finally, if the width and height of the terminal are known, then - they will be stored in the `COLS` and `LINES` attributes. - """ - # Cursor movement: - BOL = '' #: Move the cursor to the beginning of the line - UP = '' #: Move the cursor up one line - DOWN = '' #: Move the cursor down one line - LEFT = '' #: Move the cursor left one char - RIGHT = '' #: Move the cursor right one char - - # Deletion: - CLEAR_SCREEN = '' #: Clear the screen and move to home position - CLEAR_EOL = '' #: Clear to the end of the line. - CLEAR_BOL = '' #: Clear to the beginning of the line. - CLEAR_EOS = '' #: Clear to the end of the screen - - # Output modes: - BOLD = '' #: Turn on bold mode - BLINK = '' #: Turn on blink mode - DIM = '' #: Turn on half-bright mode - REVERSE = '' #: Turn on reverse-video mode - NORMAL = '' #: Turn off all modes - - # Cursor display: - HIDE_CURSOR = '' #: Make the cursor invisible - SHOW_CURSOR = '' #: Make the cursor visible - - # Terminal size: - COLS = None #: Width of the terminal (None for unknown) - LINES = None #: Height of the terminal (None for unknown) - - # Foreground colors: - BLACK = BLUE = GREEN = CYAN = RED = MAGENTA = YELLOW = WHITE = '' - - # Background colors: - BG_BLACK = BG_BLUE = BG_GREEN = BG_CYAN = '' - BG_RED = BG_MAGENTA = BG_YELLOW = BG_WHITE = '' - - _STRING_CAPABILITIES = """ - BOL=cr UP=cuu1 DOWN=cud1 LEFT=cub1 RIGHT=cuf1 - CLEAR_SCREEN=clear CLEAR_EOL=el CLEAR_BOL=el1 CLEAR_EOS=ed BOLD=bold - BLINK=blink DIM=dim REVERSE=rev UNDERLINE=smul NORMAL=sgr0 - HIDE_CURSOR=cinvis SHOW_CURSOR=cnorm""".split() - _COLORS = """BLACK BLUE GREEN CYAN RED MAGENTA YELLOW WHITE""".split() - _ANSICOLORS = "BLACK RED GREEN YELLOW BLUE MAGENTA CYAN WHITE".split() - - def __init__(self, term_stream=sys.stdout): - """ - Create a `TerminalController` and initialize its attributes - with appropriate values for the current terminal. - `term_stream` is the stream that will be used for terminal - output; if this stream is not a tty, then the terminal is - assumed to be a dumb terminal (i.e., have no capabilities). - """ - # Curses isn't available on all platforms - try: import curses - except: return - - # If the stream isn't a tty, then assume it has no capabilities. - if not term_stream.isatty(): return - - # Check the terminal type. If we fail, then assume that the - # terminal has no capabilities. - try: curses.setupterm() - except: return - - # Look up numeric capabilities. - self.COLS = curses.tigetnum('cols') - self.LINES = curses.tigetnum('lines') - - # Look up string capabilities. - for capability in self._STRING_CAPABILITIES: - (attrib, cap_name) = capability.split('=') - setattr(self, attrib, self._tigetstr(cap_name) or '') - - # Colors - set_fg = self._tigetstr('setf') - if set_fg: - for i,color in zip(range(len(self._COLORS)), self._COLORS): - setattr(self, color, curses.tparm(set_fg, i) or '') - set_fg_ansi = self._tigetstr('setaf') - if set_fg_ansi: - for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS): - setattr(self, color, curses.tparm(set_fg_ansi, i) or '') - set_bg = self._tigetstr('setb') - if set_bg: - for i,color in zip(range(len(self._COLORS)), self._COLORS): - setattr(self, 'BG_'+color, curses.tparm(set_bg, i) or '') - set_bg_ansi = self._tigetstr('setab') - if set_bg_ansi: - for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS): - setattr(self, 'BG_'+color, curses.tparm(set_bg_ansi, i) or '') - - def _tigetstr(self, cap_name): - # String capabilities can include "delays" of the form "$<2>". - # For any modern terminal, we should be able to just ignore - # these, so strip them out. - import curses - cap = curses.tigetstr(cap_name) or '' - return re.sub(r'\$<\d+>[/*]?', '', cap) - - def render(self, template): - """ - Replace each $-substitutions in the given template string with - the corresponding terminal control string (if it's defined) or - '' (if it's not). - """ - return re.sub(r'\$\$|\${\w+}', self._render_sub, template) - - def _render_sub(self, match): - s = match.group() - if s == '$$': return s - else: return getattr(self, s[2:-1]) - -####################################################################### -# Example use case: progress bar -####################################################################### - -class ProgressBar: - """ - A 3-line progress bar, which looks like:: - - Header - 20% [===========----------------------------------] - progress message - - The progress bar is colored, if the terminal supports color - output; and adjusts to the width of the terminal. - """ - BAR = '%3d%% ${GREEN}[${BOLD}%s%s${NORMAL}${GREEN}]${NORMAL}\n' - HEADER = '${BOLD}${CYAN}%s${NORMAL}\n\n' - - def __init__(self, term, header): - self.term = term - if not (self.term.CLEAR_EOL and self.term.UP and self.term.BOL): - raise ValueError("Terminal isn't capable enough -- you " - "should use a simpler progress dispaly.") - self.width = self.term.COLS or 75 - self.bar = term.render(self.BAR) - self.header = self.term.render(self.HEADER % header.center(self.width)) - self.cleared = 1 #: true if we haven't drawn the bar yet. - self.update(0, '') - - def update(self, percent, message): - if self.cleared: - sys.stdout.write(self.header) - self.cleared = 0 - n = int((self.width-10)*percent) - sys.stdout.write( - self.term.BOL + self.term.UP + self.term.CLEAR_EOL + - (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + - self.term.CLEAR_EOL + message.center(self.width)) - - def clear(self): - if not self.cleared: - sys.stdout.write(self.term.BOL + self.term.CLEAR_EOL + - self.term.UP + self.term.CLEAR_EOL + - self.term.UP + self.term.CLEAR_EOL) - self.cleared = 1 diff --git a/src/libprs500/lrf/__init__.py b/src/libprs500/lrf/__init__.py deleted file mode 100644 index f8572af390..0000000000 --- a/src/libprs500/lrf/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2 of the License, or -## (at your option) any later version. -## -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -This package contains logic to read and write LRF files. The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}. -At the time fo writing, this package only supports reading and writing LRF meat information. See L{meta}. -""" - -from optparse import OptionParser, OptionValueError - -from libprs500.lrf.pylrs.pylrs import Book as _Book -from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle -from libprs500 import __version__ as VERSION - -__docformat__ = "epytext" -__author__ = "Kovid Goyal " - -class PRS500_PROFILE(object): - screen_width = 600 - screen_height = 800 - page_width = 575 - page_height = 747 - dpi = 166 - -def profile_from_string(option, opt_str, value, parser): - if value == 'prs500': - setattr(parser.values, option.dest, PRS500_PROFILE) - else: - raise OptionValueError('Profile: '+value+' is not implemented') - -class ConversionError(Exception): - pass - -def option_parser(usage): - parser = OptionParser(usage=usage, version='libprs500 '+VERSION, - epilog='Created by Kovid Goyal') - metadata = parser.add_option_group('METADATA OPTIONS') - metadata.add_option('--header', action='store_true', default=False, dest='header', - help='Add a header to all the pages with title and author.') - metadata.add_option("-t", "--title", action="store", type="string", \ - dest="title", help="Set the title. Default: filename.") - metadata.add_option("-a", "--author", action="store", type="string", \ - dest="author", help="Set the author. Default: %default", default='Unknown') - metadata.add_option("--freetext", action="store", type="string", \ - dest="freetext", help="Set the comments.", default=' ') - metadata.add_option("--category", action="store", type="string", \ - dest="category", help="Set the category", default=' ') - metadata.add_option('--title-sort', action='store', default='', dest='title_sort', - help='Sort key for the title') - metadata.add_option('--author-sort', action='store', default='', dest='author_sort', - help='Sort key for the author') - metadata.add_option('--publisher', action='store', default='Unknown', dest='publisher', - help='Publisher') - profiles=['prs500'] - parser.add_option('-o', '--output', action='store', default=None, \ - help='Output file name. Default is derived from input filename') - parser.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice', - choices=profiles, action='callback', callback=profile_from_string, - help='''Profile of the target device for which this LRF is ''' - '''being generated. Default: ''' + profiles[0] + ''' - Supported profiles: '''+', '.join(profiles)) - debug = parser.add_option_group('DEBUG OPTIONS') - debug.add_option('--verbose', dest='verbose', action='store_true', default=False, - help='''Be verbose while processing''') - debug.add_option('--lrs', action='store_true', dest='lrs', \ - help='Convert to LRS', default=False) - return parser - -def Book(font_delta=0, header=None, profile=PRS500_PROFILE, **settings): - ps = dict(textwidth=profile.page_width, - textheight=profile.page_height) - if header: - hdr = Header() - hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60)) - hb.append(header) - hdr.PutObj(hb) - ps['headheight'] = 30 - ps['header'] = header - ps['header'] = hdr - ps['topmargin'] = 10 - return _Book(textstyledefault=dict(fontsize=100+font_delta*20, - parindent=80, linespace=12), \ - pagestyledefault=ps, **settings) \ No newline at end of file diff --git a/src/libprs500/lrf/html/BeautifulSoup.py b/src/libprs500/lrf/html/BeautifulSoup.py deleted file mode 100644 index 6ef8ac026a..0000000000 --- a/src/libprs500/lrf/html/BeautifulSoup.py +++ /dev/null @@ -1,1767 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup parses a (possibly invalid) XML or HTML document into a -tree representation. It provides methods and Pythonic idioms that make -it easy to navigate, search, and modify the tree. - -A well-formed XML/HTML document yields a well-formed data -structure. An ill-formed XML/HTML document yields a correspondingly -ill-formed data structure. If your document is only locally -well-formed, you can use this library to find and process the -well-formed part of it. The BeautifulSoup class - -Beautiful Soup works with Python 2.2 and up. It has no external -dependencies, but you'll have more success at converting data to UTF-8 -if you also install these three packages: - -* chardet, for auto-detecting character encodings - http://chardet.feedparser.org/ -* cjkcodecs and iconv_codec, which add more encodings to the ones supported - by stock Python. - http://cjkpython.i18n.org/ - -Beautiful Soup defines classes for two main parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. This class has web browser-like heuristics for - obtaining a sensible parse tree in the face of common HTML errors. - -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting -the encoding of an HTML or XML document, and converting it to -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. - -For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/documentation.html - -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.0.4" -__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" -__license__ = "PSF" - -from sgmllib import SGMLParser, SGMLParseError -import codecs -import types -import re -import sgmllib -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} - -#This hack makes Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') - -DEFAULT_OUTPUT_ENCODING = "utf-8" - -# First, the classes that represent markup elements. - -class PageElement: - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent=None, previous=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.contents.index(self) - if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: - # We're replacing this element with one of its siblings. - index = self.parent.contents.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - self.parent.contents.remove(self) - except ValueError: - pass - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." - lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent != None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent == self: - index = self.find(newChild) - if index and index < position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position-1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. - break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def findNext(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - - def findAllNext(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before after Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator) - - def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.findNextSiblings, name, attrs, text, - **kwargs) - - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x - - def findPrevious(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) - - def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x - - def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) - - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x - - def findParent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. - r = None - l = self.findParents(name, attrs, 1) - if l: - r = l[0] - return r - - def findParents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - - #These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - else: - # Build a SoupStrainer - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() - while True: - try: - i = g.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. - def nextGenerator(self): - i = self - while i: - i = i.next - yield i - - def nextSiblingGenerator(self): - i = self - while i: - i = i.nextSibling - yield i - - def previousGenerator(self): - i = self - while i: - i = i.previous - yield i - - def previousSiblingGenerator(self): - i = self - while i: - i = i.previousSibling - yield i - - def parentGenerator(self): - i = self - while i: - i = i.parent - yield i - - # Utility methods - def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - -class NavigableString(unicode, PageElement): - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def __unicode__(self): - return self.__str__(None) - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - if encoding: - return self.encode(encoding) - else: - return self - -class CData(NavigableString): - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class ProcessingInstruction(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - output = self - if "%SOUP-ENCODING%" in output: - output = self.substituteEncoding(output, encoding) - return "" % self.toEncoding(output, encoding) - -class Comment(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Declaration(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot", - '"' : "quote", - "&" : "amp", - "<" : "lt", - ">" : "gt" } - - def __init__(self, parser, name, attrs=None, parent=None, - previous=None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected - self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name - if attrs == None: - attrs = [] - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self._getAttrMap().get(key, default) - - def has_key(self, key): - return self._getAttrMap().has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self._getAttrMap()[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.findAll, args, kwargs) - - def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.__str__(encoding) - - def __unicode__(self): - return self.__str__(None) - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isString(val): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - # This can't happen naturally, but it can happen - # if you modify an attribute value after parsing. - if "'" in val: - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = re.sub("([<>]|&(?![^\s]+;))", - lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", - val) - - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) - close = '' - closeTag = '' - if self.isSelfClosing: - close = ' /' - else: - closeTag = '' % encodedName - - indentTag, indentContents = 0, 0 - if prettyPrint: - indentTag = indentLevel - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: - s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: - s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": - s.append("\n") - if prettyPrint and closeTag: - s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.__str__(encoding, True) - - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) - s.append(text) - if prettyPrint: - s.append("\n") - return ''.join(s) - - #Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) - findChildren = findAll - - # Pre-3.x compatibility methods - first = find - fetch = findAll - - def fetchText(self, text=None, recursive=True, limit=None): - return self.findAll(text=text, recursive=recursive, limit=limit) - - def firstText(self, text=None, recursive=True): - return self.find(text=text, recursive=recursive) - - #Utility methods - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.contents.append(tag) - - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - - #Generator methods - def childGenerator(self): - for i in range(0, len(self.contents)): - yield self.contents[i] - raise StopIteration - - def recursiveChildGenerator(self): - stack = [(self, 0)] - while stack: - tag, start = stack.pop() - if isinstance(tag, Tag): - for i in range(start, len(tag.contents)): - a = tag.contents[i] - yield a - if isinstance(a, Tag) and tag.contents: - if i < len(tag.contents) - 1: - stack.append((tag, i+1)) - stack.append((a, 0)) - break - raise StopIteration - -# Next, a couple classes to represent queries and their results. -class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isString(attrs): - kwargs['class'] = attrs - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k,v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if isList(markup) and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isString(markup): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: - result = markup != None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isString(markup): - markup = unicode(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif isList(matchAgainst): - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isString(markup): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source): - list.__init__([]) - self.source = source - -# Now, some helper functions. - -def isList(l): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is listlike.""" - return hasattr(l, '__iter__') \ - or (type(l) in (types.ListType, types.TupleType)) - -def isString(s): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is stringlike.""" - try: - return isinstance(s, unicode) or isintance(s, basestring) - except NameError: - return isinstance(s, str) - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. - for k,v in portion.items(): - built[k] = v - elif isList(portion): - #It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - #It's a scalar. Map it to the default. - built[portion] = default - return built - -# Now, the parser classes. - -class BeautifulStoneSoup(Tag, SGMLParser): - - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". - - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" - - XML_ENTITY_LIST = {} - for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values(): - XML_ENTITY_LIST[i] = True - - SELF_CLOSING_TAGS = {} - NESTABLE_TAGS = {} - RESET_NESTING_TAGS = {} - QUOTE_TAGS = {} - - MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda x: x.group(1) + ' />'), - (re.compile(']*)>'), - lambda x: '') - ] - - ROOT_TAG_NAME = u'[document]' - - HTML_ENTITIES = "html" - XML_ENTITIES = "xml" - - def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, - markupMassage=True, smartQuotesTo=XML_ENTITIES, - convertEntities=None, selfClosingTags=None): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. - - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. - - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. - - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: - -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" - - self.parseOnlyThese = parseOnlyThese - self.fromEncoding = fromEncoding - self.smartQuotesTo = smartQuotesTo - self.convertEntities = convertEntities - if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. - self.smartQuotesTo = None - self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - self.markup = markup - self.markupMassage = markupMassage - try: - self._feed() - except StopParsing: - pass - self.markup = None # The markup can now be GCed - - def _feed(self, inDocumentEncoding=None): - # Convert the document to Unicode. - markup = self.markup - if isinstance(markup, unicode): - if not hasattr(self, 'originalEncoding'): - self.originalEncoding = None - else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding - if markup: - if self.markupMassage: - if not isList(self.markupMassage): - self.markupMassage = self.MARKUP_MASSAGE - for fix, m in self.markupMassage: - markup = fix.sub(m, markup) - self.reset() - - SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) - - if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ - or methodName.find('do_') == 0: - return SGMLParser.__getattr__(self, methodName) - elif methodName.find('__') != 0: - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - - def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) - - def reset(self): - Tag.__init__(self, self, self.ROOT_TAG_NAME) - self.hidden = 1 - SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - # Tags with just one string-owning child get the child as a - # 'string' property, so that soup.tag.string is shorthand for - # soup.tag.contents[0] - if len(self.currentTag.contents) == 1 and \ - isinstance(self.currentTag.contents[0], NavigableString): - self.currentTag.string = self.currentTag.contents[0] - - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass=NavigableString): - if self.currentData: - currentData = ''.join(self.currentData) - if not currentData.strip(): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): - return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - - - def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack)-1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

FooBar

should pop to 'p', not 'b'. -

FooBar

should pop to 'table', not 'p'. -

Foo

Bar

should pop to 'tr', not 'p'. -

FooBar

should pop to 'p', not 'b'. - -

    • *
    • * should pop to 'ul', not the first 'li'. -
  • ** should pop to 'table', not the first 'tr' - tag should - implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.tagStack)-1, 0, -1): - p = self.tagStack[i] - if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. - popTo = name - break - if (nestingResetTriggers != None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers == None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self._popToTag(popTo, inclusive) - - def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s: %s" % (name, attrs) - if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) - self.handle_data('<%s%s>' % (name, attrs)) - return - self.endData() - - if not self.isSelfClosingTag(name) and not selfClosing: - self._smartPop(name) - - if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): - return - - tag = Tag(self, name, attrs, self.currentTag, self.previous) - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - if selfClosing or self.isSelfClosingTag(name): - self.popTag() - if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - return tag - - def unknown_endtag(self, name): - #print "End tag %s" % name - if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print " is not real!" % name - self.handle_data('' % name) - return - self.endData() - self._popToTag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, data): - self.currentData.append(data) - - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) - - def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = "xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction) - - def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." - if self.convertEntities in [self.HTML_ENTITIES, - self.XML_ENTITIES]: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertEntities == self.HTML_ENTITIES or \ - (self.convertEntities == self.XML_ENTITIES and \ - self.XML_ENTITY_LIST.get(ref)): - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - if not data: - data = '&%s;' % ref - self.handle_data(data) - - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - -class BeautifulSoup(BeautifulStoneSoup): - - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. - - * Tag nesting rules: - - Most tags can't be nested at all. For instance, the occurance of - a

    tag should implicitly close the previous

    tag. - -

    Para1

    Para2 - should be transformed into: -

    Para1

    Para2 - - Some tags can be nested arbitrarily. For instance, the occurance - of a

    tag should _not_ implicitly close the previous -
    tag. - - Alice said:
    Bob said:
    Blah - should NOT be transformed into: - Alice said:
    Bob said:
    Blah - - Some tags can be nested, but the nesting is reset by the - interposition of other tags. For instance, a
    , - but not close a tag in another table. - -
    BlahBlah - should be transformed into: -
    BlahBlah - but, - Blah
    Blah - should NOT be transformed into - Blah
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" - - def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): - kwargs['smartQuotesTo'] = self.HTML_ENTITIES - BeautifulStoneSoup.__init__(self, *args, **kwargs) - - SELF_CLOSING_TAGS = buildTagMap(None, - ['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - QUOTE_TAGS = {'script': None} - - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center'] - - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] - - #Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], - } - - NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] - - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. - RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', - NON_NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, - NESTABLE_TABLE_TAGS) - - NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - - # Used to detect the charset in a META tag; see start_meta - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)") - - def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. - match = self.CHARSET_RE.search(contentType) - if match: - if getattr(self, 'declaredHTMLEncoding') or \ - (self.originalEncoding == self.fromEncoding): - # This is our second pass through the document, or - # else an encoding was specified explicitly and it - # worked. Rewrite the meta tag. - newAttr = self.CHARSET_RE.sub\ - (lambda(match):match.group(1) + - "%SOUP-ENCODING%", value) - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. - # Go through it again with the new information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - tag = self.unknown_starttag("meta", attrs) - if tag and tagNeedsEncodingSubstitution: - tag.containsSubstitutions = True - -class StopParsing(Exception): - pass - -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - FooBar - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big'] - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - -class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that -