From 366207763d4f2af28f427f9dc92509061b1d6032 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 18 May 2007 06:36:54 +0000 Subject: [PATCH] Module re-organization --- .pydevproject | 3 + setup.py | 26 +- src/libprs500/devices/__init__.py | 17 + src/libprs500/{ => devices}/errors.py | 2 +- .../{device.py => devices/interface.py} | 20 - src/libprs500/{ => devices}/libusb.py | 0 src/libprs500/devices/manager.py | 53 + src/libprs500/devices/prs500/__init__.py | 18 + src/libprs500/{ => devices/prs500}/books.py | 0 src/libprs500/devices/prs500/cli/__init__.py | 21 + src/libprs500/devices/prs500/cli/main.py | 325 +++ src/libprs500/devices/prs500/cli/terminfo.py | 208 ++ .../{prs500.py => devices/prs500/driver.py} | 12 +- .../{ => devices/prs500}/prstypes.py | 2 +- src/libprs500/ebooks/BeautifulSoup.py | 1767 +++++++++++++ src/libprs500/ebooks/__init__.py | 19 + src/libprs500/ebooks/lrf/__init__.py | 94 + src/libprs500/ebooks/lrf/html/__init__.py | 20 + src/libprs500/ebooks/lrf/html/convert_from.py | 1269 +++++++++ src/libprs500/ebooks/lrf/html/demo/demo.html | 89 + src/libprs500/ebooks/lrf/html/demo/large.jpg | Bin 0 -> 46791 bytes src/libprs500/ebooks/lrf/html/demo/medium.jpg | Bin 0 -> 5152 bytes src/libprs500/ebooks/lrf/html/demo/small.jpg | Bin 0 -> 2055 bytes src/libprs500/ebooks/lrf/meta.py | 628 +++++ src/libprs500/ebooks/lrf/pylrs/__init__.py | 5 + src/libprs500/ebooks/lrf/pylrs/elements.py | 79 + src/libprs500/ebooks/lrf/pylrs/pylrf.py | 777 ++++++ src/libprs500/ebooks/lrf/pylrs/pylrfopt.py | 43 + src/libprs500/ebooks/lrf/pylrs/pylrs.py | 2332 +++++++++++++++++ src/libprs500/ebooks/lrf/read_text_stream.py | 30 + src/libprs500/ebooks/lrf/txt/__init__.py | 14 + src/libprs500/ebooks/lrf/txt/convert_from.py | 109 + src/libprs500/ebooks/metadata/__init__.py | 42 + src/libprs500/ebooks/metadata/meta.py | 26 + src/libprs500/ebooks/metadata/pdf-meta.pl | 49 + src/libprs500/ebooks/metadata/rtf.py | 101 + src/libprs500/gui/database.py | 4 +- 
src/libprs500/gui/editbook_ui.py | 1 - src/libprs500/gui/main.py | 4 +- src/libprs500/gui/main_ui.py | 1 - 40 files changed, 8161 insertions(+), 49 deletions(-) create mode 100644 src/libprs500/devices/__init__.py rename src/libprs500/{ => devices}/errors.py (97%) rename src/libprs500/{device.py => devices/interface.py} (91%) rename src/libprs500/{ => devices}/libusb.py (100%) create mode 100644 src/libprs500/devices/manager.py create mode 100644 src/libprs500/devices/prs500/__init__.py rename src/libprs500/{ => devices/prs500}/books.py (100%) create mode 100644 src/libprs500/devices/prs500/cli/__init__.py create mode 100755 src/libprs500/devices/prs500/cli/main.py create mode 100644 src/libprs500/devices/prs500/cli/terminfo.py rename src/libprs500/{prs500.py => devices/prs500/driver.py} (99%) rename src/libprs500/{ => devices/prs500}/prstypes.py (99%) create mode 100644 src/libprs500/ebooks/BeautifulSoup.py create mode 100644 src/libprs500/ebooks/__init__.py create mode 100644 src/libprs500/ebooks/lrf/__init__.py create mode 100644 src/libprs500/ebooks/lrf/html/__init__.py create mode 100644 src/libprs500/ebooks/lrf/html/convert_from.py create mode 100644 src/libprs500/ebooks/lrf/html/demo/demo.html create mode 100644 src/libprs500/ebooks/lrf/html/demo/large.jpg create mode 100644 src/libprs500/ebooks/lrf/html/demo/medium.jpg create mode 100644 src/libprs500/ebooks/lrf/html/demo/small.jpg create mode 100644 src/libprs500/ebooks/lrf/meta.py create mode 100644 src/libprs500/ebooks/lrf/pylrs/__init__.py create mode 100644 src/libprs500/ebooks/lrf/pylrs/elements.py create mode 100644 src/libprs500/ebooks/lrf/pylrs/pylrf.py create mode 100644 src/libprs500/ebooks/lrf/pylrs/pylrfopt.py create mode 100644 src/libprs500/ebooks/lrf/pylrs/pylrs.py create mode 100755 src/libprs500/ebooks/lrf/read_text_stream.py create mode 100644 src/libprs500/ebooks/lrf/txt/__init__.py create mode 100644 src/libprs500/ebooks/lrf/txt/convert_from.py create mode 100644 
src/libprs500/ebooks/metadata/__init__.py create mode 100644 src/libprs500/ebooks/metadata/meta.py create mode 100644 src/libprs500/ebooks/metadata/pdf-meta.pl create mode 100644 src/libprs500/ebooks/metadata/rtf.py diff --git a/.pydevproject b/.pydevproject index ce1227533e..cea05cb04a 100644 --- a/.pydevproject +++ b/.pydevproject @@ -5,5 +5,8 @@ python 2.5 /libprs500/src +/libprs500/devices +/libprs500/libprs500.devices.prs500 +/libprs500/prs500 diff --git a/setup.py b/setup.py index 675afc569e..c2638aa074 100644 --- a/setup.py +++ b/setup.py @@ -29,11 +29,11 @@ if sys.argv[1] == 'py2exe': try: import py2exe console = [ - {'script' : 'src/libprs500/cli/main.py', 'dest_base':'prs500'}, - {'script' : 'src/libprs500/lrf/html/convert_from.py', 'dest_base':'html2lrf'}, - {'script' : 'src/libprs500/lrf/txt/convert_from.py', 'dest_base':'txt2lrf'}, - {'script' : 'src/libprs500/lrf/meta.py', 'dest_base':'lrf-meta'}, - {'script' : 'src/libprs500/metadata/rtf.py', 'dest_base':'rtf-meta'}, + {'script' : 'src/libprs500/devices/prs500/cli/main.py', 'dest_base':'prs500'}, + {'script' : 'src/libprs500/ebooks/lrf/html/convert_from.py', 'dest_base':'html2lrf'}, + {'script' : 'src/libprs500/ebooks/lrf/txt/convert_from.py', 'dest_base':'txt2lrf'}, + {'script' : 'src/libprs500/ebooks/lrf/meta.py', 'dest_base':'lrf-meta'}, + {'script' : 'src/libprs500/ebooks/metadata/rtf.py', 'dest_base':'rtf-meta'}, ] windows = [{'script' : 'src/libprs500/gui/main.py', 'dest_base':'prs500-gui', 'icon_resources':[(1,'icons/library.ico')]}] @@ -255,18 +255,14 @@ setup( author='Kovid Goyal', author_email='kovid@kovidgoyal.net', url = 'http://libprs500.kovidgoyal.net', - package_data = { \ - 'libprs500.gui' : ['*.ui'], \ - 'libprs500.lrf' : ['*.jar', '*.jpg'], \ - 'libprs500.metadata' : ['*.pl'] \ - }, + package_data = { 'libprs500.ebooks' : ['*.jpg', '*.pl'], }, entry_points = { 'console_scripts': [ \ - 'prs500 = libprs500.cli.main:main', \ - 'lrf-meta = libprs500.lrf.meta:main', \ - 'rtf-meta = 
libprs500.metadata.rtf:main', \ - 'txt2lrf = libprs500.lrf.txt.convert_from:main', \ - 'html2lrf = libprs500.lrf.html.convert_from:main',\ + 'prs500 = libprs500.devices.prs500.cli.main:main', \ + 'lrf-meta = libprs500.ebooks.lrf.meta:main', \ + 'rtf-meta = libprs500.ebooks.metadata.rtf:main', \ + 'txt2lrf = libprs500.ebooks.lrf.txt.convert_from:main', \ + 'html2lrf = libprs500.ebooks.lrf.html.convert_from:main',\ ], 'gui_scripts' : [ 'prs500-gui = libprs500.gui.main:main'] }, diff --git a/src/libprs500/devices/__init__.py b/src/libprs500/devices/__init__.py new file mode 100644 index 0000000000..a37a0ddab9 --- /dev/null +++ b/src/libprs500/devices/__init__.py @@ -0,0 +1,17 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +''' +Device drivers +''' \ No newline at end of file diff --git a/src/libprs500/errors.py b/src/libprs500/devices/errors.py similarity index 97% rename from src/libprs500/errors.py rename to src/libprs500/devices/errors.py index 88cdc3dc97..35f040cbf8 100644 --- a/src/libprs500/errors.py +++ b/src/libprs500/devices/errors.py @@ -13,7 +13,7 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
""" -Defines the errors that libprs500 generates. +Defines the errors that the libprs500 device drivers generate. G{classtree ProtocolError} """ diff --git a/src/libprs500/device.py b/src/libprs500/devices/interface.py similarity index 91% rename from src/libprs500/device.py rename to src/libprs500/devices/interface.py index 712c22cace..c0ea879e78 100644 --- a/src/libprs500/device.py +++ b/src/libprs500/devices/interface.py @@ -18,8 +18,6 @@ the GUI. A device backend must subclass the L{Device} class. See prs500.py for a backend that implement the Device interface for the SONY PRS500 Reader. """ -import threading -from functools import wraps class Device(object): """ @@ -115,21 +113,3 @@ class Device(object): """ raise NotImplementedError() - -class DeviceManager(object): - - def threaded(func): - @wraps(func) - def run_in_thread(*args, **kwargs): - dm = args[0] - dm - - - def __init__(self, device): - if not isinstance(device, Device): - raise TypeError, '%s must implement the Device interface' % (str(device),) - self.dev = device - self.lock = threading.RLock() - - - diff --git a/src/libprs500/libusb.py b/src/libprs500/devices/libusb.py similarity index 100% rename from src/libprs500/libusb.py rename to src/libprs500/devices/libusb.py diff --git a/src/libprs500/devices/manager.py b/src/libprs500/devices/manager.py new file mode 100644 index 0000000000..285e18fa07 --- /dev/null +++ b/src/libprs500/devices/manager.py @@ -0,0 +1,53 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +''' +Define a threaded interface for working with devices. +''' + +import threading, Queue + +from libprs500.devices.device import Device +from libprs500.devices.prs500.driver import PRS500 + +class DeviceManager(object): + + def __init__(self): + self.devices = [] + self.device_jobs = Queue(0) + + +class Job(object): + count = 0 + def __init__(self, func, args): + self.completed = False + self.exception = None + + +class Worker(threading.Thread): + + def __init__(self, jobs): + self.jobs = jobs + self.results = [] + threading.Thread.__init__(self) + self.setDaemon(True) + + def run(self): + '''Thread loops taking jobs from the queue as they become available''' + while True: + job = self.jobs.get(True, None) + # Do job + self.jobs.task_done() \ No newline at end of file diff --git a/src/libprs500/devices/prs500/__init__.py b/src/libprs500/devices/prs500/__init__.py new file mode 100644 index 0000000000..ecd67d3411 --- /dev/null +++ b/src/libprs500/devices/prs500/__init__.py @@ -0,0 +1,18 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. 
+## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +''' +Device driver for the Sony Reader PRS 500 +''' \ No newline at end of file diff --git a/src/libprs500/books.py b/src/libprs500/devices/prs500/books.py similarity index 100% rename from src/libprs500/books.py rename to src/libprs500/devices/prs500/books.py diff --git a/src/libprs500/devices/prs500/cli/__init__.py b/src/libprs500/devices/prs500/cli/__init__.py new file mode 100644 index 0000000000..b20d1ac2d5 --- /dev/null +++ b/src/libprs500/devices/prs500/cli/__init__.py @@ -0,0 +1,21 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Provides a command-line interface to the SONY Reader PRS-500. + +For usage information run the script. 
+""" +__docformat__ = "epytext" +__author__ = "Kovid Goyal " diff --git a/src/libprs500/devices/prs500/cli/main.py b/src/libprs500/devices/prs500/cli/main.py new file mode 100755 index 0000000000..d48fd378ad --- /dev/null +++ b/src/libprs500/devices/prs500/cli/main.py @@ -0,0 +1,325 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Provides a command-line and optional graphical interface to the SONY Reader PRS-500. + +For usage information run the script. 
+""" + +import StringIO, sys, time, os +from optparse import OptionParser + +from libprs500 import __version__ as VERSION +from libprs500.devices.prs500.driver import PRS500 +from libprs500.devices.prs500.cli.terminfo import TerminalController +from libprs500.devices.errors import ArgumentError, DeviceError, DeviceLocked + + +MINIMUM_COL_WIDTH = 12 #: Minimum width of columns in ls output + +def human_readable(size): + """ Convert a size in bytes into a human readle form """ + if size < 1024: divisor, suffix = 1, "" + elif size < 1024*1024: divisor, suffix = 1024., "K" + elif size < 1024*1024*1024: divisor, suffix = 1024*1024, "M" + elif size < 1024*1024*1024*1024: divisor, suffix = 1024*1024, "G" + size = str(size/divisor) + if size.find(".") > -1: size = size[:size.find(".")+2] + return size + suffix + +class FileFormatter(object): + def __init__(self, file, term): + self.term = term + self.is_dir = file.is_dir + self.is_readonly = file.is_readonly + self.size = file.size + self.ctime = file.ctime + self.wtime = file.wtime + self.name = file.name + self.path = file.path + + @apply + def mode_string(): + doc=""" The mode string for this file. There are only two modes read-only and read-write """ + def fget(self): + mode, x = "-", "-" + if self.is_dir: mode, x = "d", "x" + if self.is_readonly: mode += "r-"+x+"r-"+x+"r-"+x + else: mode += "rw"+x+"rw"+x+"rw"+x + return mode + return property(doc=doc, fget=fget) + + @apply + def isdir_name(): + doc='''Return self.name + '/' if self is a directory''' + def fget(self): + name = self.name + if self.is_dir: + name += '/' + return name + return property(doc=doc, fget=fget) + + + @apply + def name_in_color(): + doc=""" The name in ANSI text. 
Directories are blue, ebooks are green """ + def fget(self): + cname = self.name + blue, green, normal = "", "", "" + if self.term: blue, green, normal = self.term.BLUE, self.term.GREEN, self.term.NORMAL + if self.is_dir: cname = blue + self.name + normal + else: + ext = self.name[self.name.rfind("."):] + if ext in (".pdf", ".rtf", ".lrf", ".lrx", ".txt"): cname = green + self.name + normal + return cname + return property(doc=doc, fget=fget) + + @apply + def human_readable_size(): + doc=""" File size in human readable form """ + def fget(self): + return human_readable(self.size) + return property(doc=doc, fget=fget) + + @apply + def modification_time(): + doc=""" Last modified time in the Linux ls -l format """ + def fget(self): + return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime)) + return property(doc=doc, fget=fget) + + @apply + def creation_time(): + doc=""" Last modified time in the Linux ls -l format """ + def fget(self): + return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime)) + return property(doc=doc, fget=fget) + +def info(dev): + info = dev.get_device_information() + print "Device name: ", info[0] + print "Device version: ", info[1] + print "Software version:", info[2] + print "Mime type: ", info[3] + +def ls(dev, path, term, recurse=False, color=False, human_readable_size=False, ll=False, cols=0): + def col_split(l, cols): # split list l into columns + rows = len(l) / cols + if len(l) % cols: + rows += 1 + m = [] + for i in range(rows): + m.append(l[i::rows]) + return m + + def row_widths(table): # Calculate widths for each column in the row-wise table + tcols = len(table[0]) + rowwidths = [ 0 for i in range(tcols) ] + for row in table: + c = 0 + for item in row: + rowwidths[c] = len(item) if len(item) > rowwidths[c] else rowwidths[c] + c += 1 + return rowwidths + + output = StringIO.StringIO() + if path.endswith("/"): path = path[:-1] + dirs = dev.list(path, recurse) + for dir in dirs: + if recurse: print >>output, dir[0] + 
":" + lsoutput, lscoloutput = [], [] + files = dir[1] + maxlen = 0 + if ll: # Calculate column width for size column + for file in files: + size = len(str(file.size)) + if human_readable_size: + file = FileFormatter(file, term) + size = len(file.human_readable_size) + if size > maxlen: maxlen = size + for file in files: + file = FileFormatter(file, term) + name = file.name if ll else file.isdir_name + lsoutput.append(name) + if color: name = file.name_in_color + lscoloutput.append(name) + if ll: + size = str(file.size) + if human_readable_size: size = file.human_readable_size + print >>output, file.mode_string, ("%"+str(maxlen)+"s")%size, file.modification_time, name + if not ll and len(lsoutput) > 0: + trytable = [] + for colwidth in range(MINIMUM_COL_WIDTH, cols): + trycols = int(cols/colwidth) + trytable = col_split(lsoutput, trycols) + works = True + for row in trytable: + row_break = False + for item in row: + if len(item) > colwidth - 1: + works, row_break = False, True + break + if row_break: break + if works: break + rowwidths = row_widths(trytable) + trytablecol = col_split(lscoloutput, len(trytable[0])) + for r in range(len(trytable)): + for c in range(len(trytable[r])): + padding = rowwidths[c] - len(trytable[r][c]) + print >>output, trytablecol[r][c], "".ljust(padding), + print >>output + print >>output + listing = output.getvalue().rstrip()+ "\n" + output.close() + return listing + +def main(): + term = TerminalController() + cols = term.COLS + if not cols: # On windows terminal width is unknown + cols = 80 + + parser = OptionParser(usage="usage: %prog [options] command args\n\ncommand is one of: info, books, df, ls, cp, mkdir, touch, cat, rm\n\n"+ + "For help on a particular command: %prog command", version="libprs500 version: " + VERSION) + parser.add_option("--log-packets", help="print out packet stream to stdout. 
"+\ + "The numbers in the left column are byte offsets that allow the packet size to be read off easily.", + dest="log_packets", action="store_true", default=False) + parser.add_option("--unlock", help="Unlock device with KEY. For e.g. --unlock=1234", \ + dest='key', default='-1') + parser.remove_option("-h") + parser.disable_interspersed_args() # Allow unrecognized options + options, args = parser.parse_args() + + if len(args) < 1: + parser.print_help() + return 1 + + command = args[0] + args = args[1:] + dev = PRS500(key=options.key, log_packets=options.log_packets) + try: + if command == "df": + total = dev.total_space(end_session=False) + free = dev.free_space() + where = ("Memory", "Stick", "Card") + print "Filesystem\tSize \tUsed \tAvail \tUse%" + for i in range(3): + print "%-10s\t%s\t%s\t%s\t%s"%(where[i], human_readable(total[i]), human_readable(total[i]-free[i]), human_readable(free[i]),\ + str(0 if total[i]==0 else int(100*(total[i]-free[i])/(total[i]*1.)))+"%") + elif command == "books": + print "Books in main memory:" + for book in dev.books(): + print book + print "\nBooks on storage card:" + for book in dev.books(oncard=True): print book + elif command == "mkdir": + parser = OptionParser(usage="usage: %prog mkdir [options] path\nCreate a directory on the device\n\npath must begin with /,a:/ or b:/") + if len(args) != 1: + parser.print_help() + sys.exit(1) + dev.mkdir(args[0]) + elif command == "ls": + parser = OptionParser(usage="usage: %prog ls [options] path\nList files on the device\n\npath must begin with /,a:/ or b:/") + parser.add_option("--color", help="show ls output in color", dest="color", action="store_true", default=False) + parser.add_option("-l", help="In addition to the name of each file, print the file type, permissions, and timestamp (the modification time, in the local timezone). Times are local.", dest="ll", action="store_true", default=False) + parser.add_option("-R", help="Recursively list subdirectories encountered. 
/dev and /proc are omitted", dest="recurse", action="store_true", default=False) + parser.remove_option("-h") + parser.add_option("-h", "--human-readable", help="show sizes in human readable format", dest="hrs", action="store_true", default=False) + options, args = parser.parse_args(args) + if len(args) != 1: + parser.print_help() + return 1 + print ls(dev, args[0], term, color=options.color, recurse=options.recurse, ll=options.ll, human_readable_size=options.hrs, cols=cols), + elif command == "info": + info(dev) + elif command == "cp": + usage="usage: %prog cp [options] source destination\nCopy files to/from the device\n\n"+\ + "One of source or destination must be a path on the device. \n\nDevice paths have the form\n"+\ + "prs500:mountpoint/my/path\n"+\ + "where mountpoint is one of /, a: or b:\n\n"+\ + "source must point to a file for which you have read permissions\n"+\ + "destination must point to a file or directory for which you have write permissions" + parser = OptionParser(usage=usage) + options, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() + return 1 + if args[0].startswith("prs500:"): + outfile = args[1] + path = args[0][7:] + if path.endswith("/"): path = path[:-1] + if os.path.isdir(outfile): + outfile = os.path.join(outfile, path[path.rfind("/")+1:]) + try: + outfile = open(outfile, "wb") + except IOError, e: + print >> sys.stderr, e + parser.print_help() + return 1 + dev.get_file(path, outfile) + outfile.close() + elif args[1].startswith("prs500:"): + try: + infile = open(args[0], "rb") + except IOError, e: + print >> sys.stderr, e + parser.print_help() + return 1 + dev.put_file(infile, args[1][7:]) + infile.close() + else: + parser.print_help() + return 1 + elif command == "cat": + outfile = sys.stdout + parser = OptionParser(usage="usage: %prog cat path\nShow file on the device\n\npath should point to a file on the device and must begin with /,a:/ or b:/") + options, args = parser.parse_args(args) + if len(args) != 1: 
+ parser.print_help() + return 1 + if args[0].endswith("/"): path = args[0][:-1] + else: path = args[0] + outfile = sys.stdout + dev.get_file(path, outfile) + elif command == "rm": + parser = OptionParser(usage="usage: %prog rm path\nDelete files from the device\n\npath should point to a file or empty directory on the device "+\ + "and must begin with /,a:/ or b:/\n\n"+\ + "rm will DELETE the file. Be very CAREFUL") + options, args = parser.parse_args(args) + if len(args) != 1: + parser.print_help() + return 1 + dev.rm(args[0]) + elif command == "touch": + parser = OptionParser(usage="usage: %prog touch path\nCreate an empty file on the device\n\npath should point to a file on the device and must begin with /,a:/ or b:/\n\n"+ + "Unfortunately, I cant figure out how to update file times on the device, so if path already exists, touch does nothing" ) + options, args = parser.parse_args(args) + if len(args) != 1: + parser.print_help() + return 1 + dev.touch(args[0]) + else: + parser.print_help() + if dev.handle: dev.close() + return 1 + except DeviceLocked: + print >> sys.stderr, "The device is locked. Use the --unlock option" + except (ArgumentError, DeviceError), e: + print >>sys.stderr, e + return 1 + return 0 + +if __name__ == '__main__': + main() diff --git a/src/libprs500/devices/prs500/cli/terminfo.py b/src/libprs500/devices/prs500/cli/terminfo.py new file mode 100644 index 0000000000..385994db6e --- /dev/null +++ b/src/libprs500/devices/prs500/cli/terminfo.py @@ -0,0 +1,208 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import sys, re + +""" Get information about the terminal we are running in """ + +class TerminalController: + """ + A class that can be used to portably generate formatted output to + a terminal. + + `TerminalController` defines a set of instance variables whose + values are initialized to the control sequence necessary to + perform a given action. These can be simply included in normal + output to the terminal: + + >>> term = TerminalController() + >>> print 'This is '+term.GREEN+'green'+term.NORMAL + + Alternatively, the `render()` method can used, which replaces + '${action}' with the string required to perform 'action': + + >>> term = TerminalController() + >>> print term.render('This is ${GREEN}green${NORMAL}') + + If the terminal doesn't support a given action, then the value of + the corresponding instance variable will be set to ''. As a + result, the above code will still work on terminals that do not + support color, except that their output will not be colored. + Also, this means that you can test whether the terminal supports a + given action by simply testing the truth value of the + corresponding instance variable: + + >>> term = TerminalController() + >>> if term.CLEAR_SCREEN: + ... print 'This terminal supports clearning the screen.' + + Finally, if the width and height of the terminal are known, then + they will be stored in the `COLS` and `LINES` attributes. 
+ """ + # Cursor movement: + BOL = '' #: Move the cursor to the beginning of the line + UP = '' #: Move the cursor up one line + DOWN = '' #: Move the cursor down one line + LEFT = '' #: Move the cursor left one char + RIGHT = '' #: Move the cursor right one char + + # Deletion: + CLEAR_SCREEN = '' #: Clear the screen and move to home position + CLEAR_EOL = '' #: Clear to the end of the line. + CLEAR_BOL = '' #: Clear to the beginning of the line. + CLEAR_EOS = '' #: Clear to the end of the screen + + # Output modes: + BOLD = '' #: Turn on bold mode + BLINK = '' #: Turn on blink mode + DIM = '' #: Turn on half-bright mode + REVERSE = '' #: Turn on reverse-video mode + NORMAL = '' #: Turn off all modes + + # Cursor display: + HIDE_CURSOR = '' #: Make the cursor invisible + SHOW_CURSOR = '' #: Make the cursor visible + + # Terminal size: + COLS = None #: Width of the terminal (None for unknown) + LINES = None #: Height of the terminal (None for unknown) + + # Foreground colors: + BLACK = BLUE = GREEN = CYAN = RED = MAGENTA = YELLOW = WHITE = '' + + # Background colors: + BG_BLACK = BG_BLUE = BG_GREEN = BG_CYAN = '' + BG_RED = BG_MAGENTA = BG_YELLOW = BG_WHITE = '' + + _STRING_CAPABILITIES = """ + BOL=cr UP=cuu1 DOWN=cud1 LEFT=cub1 RIGHT=cuf1 + CLEAR_SCREEN=clear CLEAR_EOL=el CLEAR_BOL=el1 CLEAR_EOS=ed BOLD=bold + BLINK=blink DIM=dim REVERSE=rev UNDERLINE=smul NORMAL=sgr0 + HIDE_CURSOR=cinvis SHOW_CURSOR=cnorm""".split() + _COLORS = """BLACK BLUE GREEN CYAN RED MAGENTA YELLOW WHITE""".split() + _ANSICOLORS = "BLACK RED GREEN YELLOW BLUE MAGENTA CYAN WHITE".split() + + def __init__(self, term_stream=sys.stdout): + """ + Create a `TerminalController` and initialize its attributes + with appropriate values for the current terminal. + `term_stream` is the stream that will be used for terminal + output; if this stream is not a tty, then the terminal is + assumed to be a dumb terminal (i.e., have no capabilities). 
+ """ + # Curses isn't available on all platforms + try: import curses + except: return + + # If the stream isn't a tty, then assume it has no capabilities. + if not term_stream.isatty(): return + + # Check the terminal type. If we fail, then assume that the + # terminal has no capabilities. + try: curses.setupterm() + except: return + + # Look up numeric capabilities. + self.COLS = curses.tigetnum('cols') + self.LINES = curses.tigetnum('lines') + + # Look up string capabilities. + for capability in self._STRING_CAPABILITIES: + (attrib, cap_name) = capability.split('=') + setattr(self, attrib, self._tigetstr(cap_name) or '') + + # Colors + set_fg = self._tigetstr('setf') + if set_fg: + for i,color in zip(range(len(self._COLORS)), self._COLORS): + setattr(self, color, curses.tparm(set_fg, i) or '') + set_fg_ansi = self._tigetstr('setaf') + if set_fg_ansi: + for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS): + setattr(self, color, curses.tparm(set_fg_ansi, i) or '') + set_bg = self._tigetstr('setb') + if set_bg: + for i,color in zip(range(len(self._COLORS)), self._COLORS): + setattr(self, 'BG_'+color, curses.tparm(set_bg, i) or '') + set_bg_ansi = self._tigetstr('setab') + if set_bg_ansi: + for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS): + setattr(self, 'BG_'+color, curses.tparm(set_bg_ansi, i) or '') + + def _tigetstr(self, cap_name): + # String capabilities can include "delays" of the form "$<2>". + # For any modern terminal, we should be able to just ignore + # these, so strip them out. + import curses + cap = curses.tigetstr(cap_name) or '' + return re.sub(r'\$<\d+>[/*]?', '', cap) + + def render(self, template): + """ + Replace each $-substitutions in the given template string with + the corresponding terminal control string (if it's defined) or + '' (if it's not). 
+ """ + return re.sub(r'\$\$|\${\w+}', self._render_sub, template) + + def _render_sub(self, match): + s = match.group() + if s == '$$': return s + else: return getattr(self, s[2:-1]) + +####################################################################### +# Example use case: progress bar +####################################################################### + +class ProgressBar: + """ + A 3-line progress bar, which looks like:: + + Header + 20% [===========----------------------------------] + progress message + + The progress bar is colored, if the terminal supports color + output; and adjusts to the width of the terminal. + """ + BAR = '%3d%% ${GREEN}[${BOLD}%s%s${NORMAL}${GREEN}]${NORMAL}\n' + HEADER = '${BOLD}${CYAN}%s${NORMAL}\n\n' + + def __init__(self, term, header): + self.term = term + if not (self.term.CLEAR_EOL and self.term.UP and self.term.BOL): + raise ValueError("Terminal isn't capable enough -- you " + "should use a simpler progress dispaly.") + self.width = self.term.COLS or 75 + self.bar = term.render(self.BAR) + self.header = self.term.render(self.HEADER % header.center(self.width)) + self.cleared = 1 #: true if we haven't drawn the bar yet. 
+ self.update(0, '') + + def update(self, percent, message): + if self.cleared: + sys.stdout.write(self.header) + self.cleared = 0 + n = int((self.width-10)*percent) + sys.stdout.write( + self.term.BOL + self.term.UP + self.term.CLEAR_EOL + + (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + + self.term.CLEAR_EOL + message.center(self.width)) + + def clear(self): + if not self.cleared: + sys.stdout.write(self.term.BOL + self.term.CLEAR_EOL + + self.term.UP + self.term.CLEAR_EOL + + self.term.UP + self.term.CLEAR_EOL) + self.cleared = 1 diff --git a/src/libprs500/prs500.py b/src/libprs500/devices/prs500/driver.py similarity index 99% rename from src/libprs500/prs500.py rename to src/libprs500/devices/prs500/driver.py index 5ac690cb7f..25431ecad4 100755 --- a/src/libprs500/prs500.py +++ b/src/libprs500/devices/prs500/driver.py @@ -51,12 +51,12 @@ from tempfile import TemporaryFile from array import array from functools import wraps -from libprs500.device import Device -from libprs500.libusb import Error as USBError -from libprs500.libusb import get_device_by_id -from libprs500.prstypes import * -from libprs500.errors import * -from libprs500.books import BookList, fix_ids +from libprs500.devices.interface import Device +from libprs500.devices.libusb import Error as USBError +from libprs500.devices.libusb import get_device_by_id +from libprs500.devices.prs500.prstypes import * +from libprs500.devices.errors import * +from libprs500.devices.prs500.books import BookList, fix_ids from libprs500 import __author__ as AUTHOR # Protocol versions libprs500 has been tested with diff --git a/src/libprs500/prstypes.py b/src/libprs500/devices/prs500/prstypes.py similarity index 99% rename from src/libprs500/prstypes.py rename to src/libprs500/devices/prs500/prstypes.py index efb9078acd..9bc80010f7 100755 --- a/src/libprs500/prstypes.py +++ b/src/libprs500/devices/prs500/prstypes.py @@ -44,7 +44,7 @@ Answers are organized as follows: G{classtree Answer} import struct 
import time -from libprs500.errors import PacketError +from libprs500.devices.errors import PacketError WORD = "= len(self.contents): + newChild.nextSibling = None + + parent = self + parentsNextSibling = None + while not parentsNextSibling: + parentsNextSibling = parent.nextSibling + parent = parent.parent + if not parent: # This is the last element in the document. + break + if parentsNextSibling: + newChildsLastElement.next = parentsNextSibling + else: + newChildsLastElement.next = None + else: + nextChild = self.contents[position] + newChild.nextSibling = nextChild + if newChild.nextSibling: + newChild.nextSibling.previousSibling = newChild + newChildsLastElement.next = nextChild + + if newChildsLastElement.next: + newChildsLastElement.next.previous = newChildsLastElement + self.contents.insert(position, newChild) + + def findNext(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + + def findAllNext(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before after Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.nextGenerator) + + def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._findOne(self.findNextSiblings, name, attrs, text, + **kwargs) + + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.nextSiblingGenerator, **kwargs) + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + + def findPrevious(self, 
name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + + def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.previousGenerator, + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x + + def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._findOne(self.findPreviousSiblings, name, attrs, text, + **kwargs) + + def findPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.previousSiblingGenerator, **kwargs) + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + def findParent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _findOne because findParents takes a different + # set of arguments. + r = None + l = self.findParents(name, attrs, 1) + if l: + r = l[0] + return r + + def findParents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._findAll(name, attrs, None, limit, self.parentGenerator, + **kwargs) + fetchParents = findParents # Compatibility with pre-3.x + + #These methods do the real heavy lifting. 
+ + def _findOne(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These Generators can be used to navigate starting from both + #NavigableStrings and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + # Utility methods + def substituteEncoding(self, str, encoding=None): + encoding = encoding or "utf-8" + return str.replace("%SOUP-ENCODING%", encoding) + + def toEncoding(self, s, encoding=None): + """Encodes an object to a string in some encoding, or to Unicode. + .""" + if isinstance(s, unicode): + if encoding: + s = s.encode(encoding) + elif isinstance(s, str): + if encoding: + s = s.encode(encoding) + else: + s = unicode(s) + else: + if encoding: + s = self.toEncoding(str(s), encoding) + else: + s = unicode(s) + return s + +class NavigableString(unicode, PageElement): + + def __getattr__(self, attr): + """text.string gives you text. 
This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + + def __unicode__(self): + return self.__str__(None) + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + if encoding: + return self.encode(encoding) + else: + return self + +class CData(NavigableString): + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class ProcessingInstruction(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + output = self + if "%SOUP-ENCODING%" in output: + output = self.substituteEncoding(output, encoding) + return "" % self.toEncoding(output, encoding) + +class Comment(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class Declaration(NavigableString): + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + return "" % NavigableString.__str__(self, encoding) + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot", + '"' : "quote", + "&" : "amp", + "<" : "lt", + ">" : "gt" } + + def __init__(self, parser, name, attrs=None, parent=None, + previous=None): + "Basic constructor." 
+ + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected + self.parserClass = parser.__class__ + self.isSelfClosing = parser.isSelfClosingTag(name) + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + self.containsSubstitutions = False + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def has_key(self, key): + return self._getAttrMap().has_key(key) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. 
+ self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + findAll() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.findAll, args, kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.find(tag[:-3]) + elif tag.find('__') != 0: + return self.find(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.__str__(encoding) + + def __unicode__(self): + return self.__str__(None) + + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + """Returns a string or Unicode representation of this tag and + its contents. To get Unicode, pass None for encoding. 
+ + NOTE: since Python's HTML parser consumes whitespace, this + method is not certain to reproduce the whitespace present in + the original string.""" + + encodedName = self.toEncoding(self.name, encoding) + + attrs = [] + if self.attrs: + for key, val in self.attrs: + fmt = '%s="%s"' + if isString(val): + if self.containsSubstitutions and '%SOUP-ENCODING%' in val: + val = self.substituteEncoding(val, encoding) + + # The attribute value either: + # + # * Contains no embedded double quotes or single quotes. + # No problem: we enclose it in double quotes. + # * Contains embedded single quotes. No problem: + # double quotes work here too. + # * Contains embedded double quotes. No problem: + # we enclose it in single quotes. + # * Embeds both single _and_ double quotes. This + # can't happen naturally, but it can happen if + # you modify an attribute value after parsing + # the document. Now we have a bit of a + # problem. We solve it by enclosing the + # attribute in single quotes, and escaping any + # embedded single quotes to XML entities. + if '"' in val: + fmt = "%s='%s'" + # This can't happen naturally, but it can happen + # if you modify an attribute value after parsing. + if "'" in val: + val = val.replace("'", "&squot;") + + # Now we're okay w/r/t quotes. But the attribute + # value might also contain angle brackets, or + # ampersands that aren't part of entities. We need + # to escape those to XML entities too. 
+ val = re.sub("([<>]|&(?![^\s]+;))", + lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", + val) + + attrs.append(fmt % (self.toEncoding(key, encoding), + self.toEncoding(val, encoding))) + close = '' + closeTag = '' + if self.isSelfClosing: + close = ' /' + else: + closeTag = '' % encodedName + + indentTag, indentContents = 0, 0 + if prettyPrint: + indentTag = indentLevel + space = (' ' * (indentTag-1)) + indentContents = indentTag + 1 + contents = self.renderContents(encoding, prettyPrint, indentContents) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if prettyPrint: + s.append(space) + s.append('<%s%s%s>' % (encodedName, attributeString, close)) + if prettyPrint: + s.append("\n") + s.append(contents) + if prettyPrint and contents and contents[-1] != "\n": + s.append("\n") + if prettyPrint and closeTag: + s.append(space) + s.append(closeTag) + if prettyPrint and closeTag and self.nextSibling: + s.append("\n") + s = ''.join(s) + return s + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.__str__(encoding, True) + + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + """Renders the contents of this tag as a string in the given + encoding. 
If encoding is None, returns a Unicode string..""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.__str__(encoding) + elif isinstance(c, Tag): + s.append(c.__str__(encoding, prettyPrint, indentLevel)) + if text and prettyPrint: + text = text.strip() + if text: + if prettyPrint: + s.append(" " * (indentLevel-1)) + s.append(text) + if prettyPrint: + s.append("\n") + return ''.join(s) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def findAll(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. 
The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._findAll(name, attrs, text, limit, generator, **kwargs) + findChildren = findAll + + # Pre-3.x compatibility methods + first = find + fetch = findAll + + def fetchText(self, text=None, recursive=True, limit=None): + return self.findAll(text=text, recursive=recursive, limit=limit) + + def firstText(self, text=None, recursive=True): + return self.find(text=text, recursive=recursive) + + #Utility methods + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + +# Next, a couple classes to represent queries and their results. 
+class SoupStrainer: + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = name + if isString(attrs): + kwargs['class'] = attrs + attrs = None + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + self.attrs = attrs + self.text = text + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def searchTag(self, markupName=None, markupAttrs={}): + found = None + markup = None + if isinstance(markupName, Tag): + markup = markupName + markupAttrs = markup + callFunctionWithTagData = callable(self.name) \ + and not isinstance(markupName, Tag) + + if (not self.name) \ + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): + if callFunctionWithTagData: + match = self.name(markupName, markupAttrs) + else: + match = True + markupAttrMap = None + for attr, matchAgainst in self.attrs.items(): + if not markupAttrMap: + if hasattr(markupAttrs, 'get'): + markupAttrMap = markupAttrs + else: + markupAttrMap = {} + for k,v in markupAttrs: + markupAttrMap[k] = v + attrValue = markupAttrMap.get(attr) + if not self._matches(attrValue, matchAgainst): + match = False + break + if match: + if markup: + found = markup + else: + found = markupName + return found + + def search(self, markup): + #print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if isList(markup) and not isinstance(markup, Tag): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. 
+ elif isinstance(markup, Tag): + if not self.text: + found = self.searchTag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isString(markup): + if self._matches(markup, self.text): + found = markup + else: + raise Exception, "I don't know how to match against a %s" \ + % markup.__class__ + return found + + def _matches(self, markup, matchAgainst): + #print "Matching %s against %s" % (markup, matchAgainst) + result = False + if matchAgainst == True and type(matchAgainst) == types.BooleanType: + result = markup != None + elif callable(matchAgainst): + result = matchAgainst(markup) + else: + #Custom match methods take the tag as an argument, but all + #other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + if markup and not isString(markup): + markup = unicode(markup) + #Now we know that chunk is either a string, or None. + if hasattr(matchAgainst, 'match'): + # It's a regexp object. + result = markup and matchAgainst.search(markup) + elif isList(matchAgainst): + result = markup in matchAgainst + elif hasattr(matchAgainst, 'items'): + result = markup.has_key(matchAgainst) + elif matchAgainst and isString(markup): + if isinstance(markup, unicode): + matchAgainst = unicode(matchAgainst) + else: + matchAgainst = str(matchAgainst) + + if not result: + result = matchAgainst == markup + return result + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source + +# Now, some helper functions. 
+ +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isintance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and + NESTING_RESET_TAGS maps out of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +# Now, the parser classes. + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and search code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "" actually means + "". + + [Another possible explanation is "", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] 
+ + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + XML_ENTITY_LIST = {} + for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values(): + XML_ENTITY_LIST[i] = True + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile(']*)>'), + lambda x: '') + ] + + ROOT_TAG_NAME = u'[document]' + + HTML_ENTITIES = "html" + XML_ENTITIES = "xml" + + def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, + markupMassage=True, smartQuotesTo=XML_ENTITIES, + convertEntities=None, selfClosingTags=None): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser. + + sgmllib will process most bad HTML, and the BeautifulSoup + class has some tricks for dealing with some HTML that kills + sgmllib, but Beautiful Soup can nonetheless choke or lose data + if your data uses self-closing tags or declarations + incorrectly. + + By default, Beautiful Soup uses regexes to sanitize input, + avoiding the vast majority of these problems. If the problems + don't apply to you, pass in False for markupMassage, and + you'll get better performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke sgmllib: + +
(No space between name of closing tag and tag close) + (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + self.smartQuotesTo = smartQuotesTo + self.convertEntities = convertEntities + if self.convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) + SGMLParser.__init__(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + self.markupMassage = markupMassage + try: + self._feed() + except StopParsing: + pass + self.markup = None # The markup can now be GCed + + def _feed(self, inDocumentEncoding=None): + # Convert the document to Unicode. + markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + if markup: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + self.reset() + + SGMLParser.feed(self, markup) + # Close out any unfinished strings and close all the open tags. 
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + #print "__getattr__ called on %s.%s" % (self.__class__, methodName) + + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return self.SELF_CLOSING_TAGS.has_key(name) \ + or self.instanceSelfClosingTags.has_key(name) + + def reset(self): + Tag.__init__(self, self, self.ROOT_TAG_NAME) + self.hidden = 1 + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.quoteStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = ''.join(self.currentData) + if not currentData.strip(): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and 
len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: +

FooBar

should pop to 'p', not 'b'. +

FooBar

should pop to 'table', not 'p'. +

Foo

Bar

should pop to 'tr', not 'p'. +

FooBar

should pop to 'p', not 'b'. + +

    • *
    • * should pop to 'ul', not the first 'li'. +
  • ** should pop to 'table', not the first 'tr' + tag should + implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s: %s" % (name, attrs) + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + + if not self.isSelfClosingTag(name) and not selfClosing: + self._smartPop(name) + + if self.parseOnlyThese and len(self.tagStack) <= 1 \ + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + return + + tag = Tag(self, name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or self.isSelfClosingTag(name): + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + return tag + + def unknown_endtag(self, name): + #print "End tag %s" % name + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. 
+ #print " is not real!" % name + self.handle_data('' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.endData() + self.handle_data(text) + self.endData(subclass) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = "xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.convertEntities in [self.HTML_ENTITIES, + self.XML_ENTITIES]: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML entity references to the corresponding Unicode + characters.""" + data = None + if self.convertEntities == self.HTML_ENTITIES or \ + (self.convertEntities == self.XML_ENTITIES and \ + self.XML_ENTITY_LIST.get(ref)): + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + if not data: + data = '&%s;' % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. 
Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = SGMLParser.parse_declaration(self, i) + except SGMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a

    <p> tag should implicitly close the previous <p> tag. + +
    <p>Para1<p>Para2 + should be transformed into: +
    <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurrence + of a <blockquote> tag should _not_ implicitly close the previous +
    <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. + +
    <table><tr>Blah<tr>Blah + should be transformed into: +
    <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>
    Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup, MinimalSoup, or + BeautifulStoneSoup before writing your own subclass.""" + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + BeautifulStoneSoup.__init__(self, *args, **kwargs) + + SELF_CLOSING_TAGS = buildTagMap(None, + ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. 
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + + # Used to detect the charset in a META tag; see start_meta + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)") + + def start_meta(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if getattr(self, 'declaredHTMLEncoding') or \ + (self.originalEncoding == self.fromEncoding): + # This is our second pass through the document, or + # else an encoding was specified explicitly and it + # worked. Rewrite the meta tag. + newAttr = self.CHARSET_RE.sub\ + (lambda(match):match.group(1) + + "%SOUP-ENCODING%", value) + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the new information. 
+ newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + tag = self.unknown_starttag("meta", attrs) + if tag and tagNeedsEncodingSubstitution: + tag.containsSubstitutions = True + +class StopParsing(Exception): + pass + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + FooBar + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "FooBar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. 
This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class MinimalSoup(BeautifulSoup): + """The MinimalSoup class is for parsing HTML that contains + pathologically bad markup. It makes no assumptions about tag + nesting, but it does know which tags are self-closing, that +