diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 682c82cd1b..4a968966c7 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -291,6 +291,7 @@ from calibre.web.feeds.input import RecipeInput
 from calibre.ebooks.oeb.output import OEBOutput
 from calibre.ebooks.epub.output import EPUBOutput
 from calibre.ebooks.mobi.output import MOBIOutput
+from calibre.ebooks.lrf.output import LRFOutput
 from calibre.ebooks.txt.output import TXTOutput
 from calibre.ebooks.pdf.output import PDFOutput
 from calibre.ebooks.pml.input import PMLInput
@@ -310,7 +311,7 @@ from calibre.devices.jetbook.driver import JETBOOK
 plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
         TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
         FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
-        PMLOutput, MOBIOutput]
+        PMLOutput, MOBIOutput, LRFOutput]
 plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
             EB600, JETBOOK]
 plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 7c654f924d..502102a59a 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -236,7 +236,6 @@ OptionRecommendation(name='page_breaks_before',
             'before the specified elements.')
         ),
-
 OptionRecommendation(name='margin_top',
         recommended_value=5.0, level=OptionRecommendation.LOW,
         help=_('Set the top margin in pts. Default is %default. '
@@ -614,11 +613,18 @@ OptionRecommendation(name='list_recipes',
         if self.opts.extra_css and os.path.exists(self.opts.extra_css):
             self.opts.extra_css = open(self.opts.extra_css, 'rb').read()

+        oibl = self.opts.insert_blank_line
+        orps = self.opts.remove_paragraph_spacing
+        if self.output_plugin.file_type == 'lrf':
+            self.opts.insert_blank_line = False
+            self.opts.remove_paragraph_spacing = False
         flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                 lineh=self.opts.line_height,
                 untable=self.output_plugin.file_type in ('mobi','lit'),
                 unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
         flattener(self.oeb, self.opts)
+        self.opts.insert_blank_line = oibl
+        self.opts.remove_paragraph_spacing = orps

         if self.opts.linearize_tables and \
             self.output_plugin.file_type not in ('mobi', 'lrf'):
diff --git a/src/calibre/ebooks/lrf/__init__.py b/src/calibre/ebooks/lrf/__init__.py
index ae74e429ad..9f6be65e3a 100644
--- a/src/calibre/ebooks/lrf/__init__.py
+++ b/src/calibre/ebooks/lrf/__init__.py
@@ -1,43 +1,19 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-"""
-This package contains logic to read and write LRF files.
-The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
-"""
-import sys, os
-from optparse import OptionValueError
-from htmlentitydefs import name2codepoint
+"""
+This package contains logic to read and write LRF files.
+The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
+""" from uuid import uuid4 from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book -from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \ - Paragraph, TextStyle, BlockStyle +from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \ + TextStyle, BlockStyle from calibre.ebooks.lrf.fonts import FONT_FILE_MAP from calibre.ebooks import ConversionError -from calibre import __appname__, __version__, __author__, iswindows -from calibre.utils.config import OptionParser __docformat__ = "epytext" -preferred_source_formats = [ - 'LIT', - 'MOBI', - 'EPUB', - 'ODT', - 'HTML', - 'HTM', - 'XHTM', - 'XHTML', - 'PRC', - 'AZW', - 'FB2', - 'RTF', - 'PDF', - 'TXT', - 'ZIP', - 'RAR' - ] - class LRFParseError(Exception): pass @@ -55,174 +31,8 @@ class PRS500_PROFILE(object): header_height = 30 #: In px default_fonts = { 'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman", 'serif': "Dutch801 Rm BT Roman"} - - name = 'prs500' - -profile_map = { - PRS500_PROFILE.name : PRS500_PROFILE, - } - -def profile_from_string(option, opt_str, value, parser): - try: - profile = profile_map[value] - setattr(parser.values, option.dest, profile) - except KeyError: - raise OptionValueError('Profile: '+value+' is not implemented. Implemented profiles: %s'%(profile_map.keys())) - -def option_parser(usage, gui_mode=False): - parser = OptionParser(usage=usage, gui_mode=gui_mode) - metadata = parser.add_option_group('METADATA OPTIONS') - metadata.add_option("-t", "--title", action="store", type="string", default=None,\ - dest="title", help=_("Set the title. Default: filename.")) - metadata.add_option("-a", "--author", action="store", type="string", \ - dest="author", help=_("Set the author(s). Multiple authors should be set as a comma separated list. Default: %default"), - default=_('Unknown')) - metadata.add_option("--comment", action="store", type="string", \ - dest="freetext", help=_("Set the comment."), default=_('Unknown')) - metadata.add_option("--category", action="store", type="string", \ - dest="category", help=_("Set the category"), default=_('Unknown')) - metadata.add_option('--title-sort', action='store', default='', dest='title_sort', - help=_('Sort key for the title')) - metadata.add_option('--author-sort', action='store', default='', dest='author_sort', - help=_('Sort key for the author')) - metadata.add_option('--publisher', action='store', default=_('Unknown'), dest='publisher', - help=_('Publisher')) - metadata.add_option('--cover', action='store', dest='cover', default=None, \ - help=_('Path to file containing image to be used as cover')) - metadata.add_option('--use-metadata-cover', action='store_true', default=False, - help=_('If there is a cover graphic detected in the source file, use that instead of the specified cover.')) - - parser.add_option('-o', '--output', action='store', default=None, \ - help=_('Output file name. Default is derived from input filename')) - parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables', - help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.')) - laf = parser.add_option_group('LOOK AND FEEL') - laf.add_option('--base-font-size', action='store', type='float', default=10., - help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. 
Default: %defaultpt''')) - laf.add_option('--enable-autorotation', action='store_true', default=False, - help=_('Enable autorotation of images that are wider than the screen width.'), - dest='autorotation') - laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float', - help=_('Set the space between words in pts. Default is %default')) - laf.add_option('--blank-after-para', action='store_true', default=False, - dest='blank_after_para', help=_('Separate paragraphs by blank lines.')) - laf.add_option('--header', action='store_true', default=False, dest='header', - help=_('Add a header to all the pages with title and author.')) - laf.add_option('--headerformat', default="%t by %a", dest='headerformat', type='string', - help=_('Set the format of the header. %a is replaced by the author and %t by the title. Default is %default')) - laf.add_option('--header-separation', default=0, type='int', - help=_('Add extra spacing below the header. Default is %default px.')) - laf.add_option('--override-css', default=None, dest='_override_css', type='string', - help=_('Override the CSS. Can be either a path to a CSS stylesheet or a string. If it is a string it is interpreted as CSS.')) - laf.add_option('--use-spine', default=False, dest='use_spine', action='store_true', - help=_('Use the element from the OPF file to determine the order in which the HTML files are appended to the LRF. The .opf file must be in the same directory as the base HTML file.')) - laf.add_option('--minimum-indent', default=0, type='float', - help=_('Minimum paragraph indent (the indent of the first line of a paragraph) in pts. Default: %default')) - laf.add_option('--font-delta', action='store', type='float', default=0., \ - help=_("""Increase the font size by 2 * FONT_DELTA pts and """ - '''the line spacing by FONT_DELTA pts. FONT_DELTA can be a fraction.''' - """If FONT_DELTA is negative, the font size is decreased."""), - dest='font_delta') - laf.add_option('--ignore-colors', action='store_true', default=False, dest='ignore_colors', - help=_('Render all content as black on white instead of the colors specified by the HTML or CSS.')) - - page = parser.add_option_group('PAGE OPTIONS') - profiles = profile_map.keys() - page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice', - choices=profiles, action='callback', callback=profile_from_string, - help=_('''Profile of the target device for which this LRF is ''' - '''being generated. The profile determines things like the ''' - '''resolution and screen size of the target device. ''' - '''Default: %s Supported profiles: ''')%(PRS500_PROFILE.name,)+\ - ', '.join(profiles)) - page.add_option('--left-margin', default=20, dest='left_margin', type='int', - help=_('''Left margin of page. Default is %default px.''')) - page.add_option('--right-margin', default=20, dest='right_margin', type='int', - help=_('''Right margin of page. Default is %default px.''')) - page.add_option('--top-margin', default=10, dest='top_margin', type='int', - help=_('''Top margin of page. Default is %default px.''')) - page.add_option('--bottom-margin', default=0, dest='bottom_margin', type='int', - help=_('''Bottom margin of page. 
Default is %default px.''')) - page.add_option('--render-tables-as-images', default=False, action='store_true', - help=_('Render tables in the HTML as images (useful if the document has large or complex tables)')) - page.add_option('--text-size-multiplier-for-rendered-tables', type='float', default=1.0, - help=_('Multiply the size of text in rendered tables by this factor. Default is %default')) - - link = parser.add_option_group('LINK PROCESSING OPTIONS') - link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \ - dest='link_levels', - help=_(r'''The maximum number of levels to recursively process ''' - '''links. A value of 0 means thats links are not followed. ''' - '''A negative value means that tags are ignored.''')) - link.add_option('--link-exclude', dest='link_exclude', default='@', - help=_('''A regular expression. tags whose href ''' - '''matches will be ignored. Defaults to %default''')) - link.add_option('--no-links-in-toc', action='store_true', default=False, - dest='no_links_in_toc', - help=_('''Don't add links to the table of contents.''')) - chapter = parser.add_option_group('CHAPTER OPTIONS') - chapter.add_option('--disable-chapter-detection', action='store_true', - default=False, dest='disable_chapter_detection', - help=_('''Prevent the automatic detection chapters.''')) - chapter.add_option('--chapter-regex', dest='chapter_regex', - default='chapter|book|appendix', - help=_('''The regular expression used to detect chapter titles.''' - ''' It is searched for in heading tags (h1-h6). Defaults to %default''')) - chapter.add_option('--chapter-attr', default='$,,$', - help=_('Detect a chapter beginning at an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". You can set the attribute to "none" to match only on tag names. So for example, to match all h2 tags, you would use "h2,none,". Default is %default''')) - chapter.add_option('--page-break-before-tag', dest='page_break', default='h[12]', - help=_('''If html2lrf does not find any page breaks in the ''' - '''html file and cannot detect chapter headings, it will ''' - '''automatically insert page-breaks before the tags whose ''' - '''names match this regular expression. Defaults to %default. ''' - '''You can disable it by setting the regexp to "$". ''' - '''The purpose of this option is to try to ensure that ''' - '''there are no really long pages as this degrades the page ''' - '''turn performance of the LRF. Thus this option is ignored ''' - '''if the current page has only a few elements.''')) - chapter.add_option('--force-page-break-before-tag', dest='force_page_break', - default='$', help=_('Force a page break before tags whose names match this regular expression.')) - chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr', - default='$,,$', help=_('Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". 
Default is %default''')) - chapter.add_option('--add-chapters-to-toc', action='store_true', - default=False, dest='add_chapters_to_toc', - help=_('''Add detected chapters to the table of contents.''')) - prepro = parser.add_option_group('PREPROCESSING OPTIONS') - prepro.add_option('--baen', action='store_true', default=False, dest='baen', - help=_('''Preprocess Baen HTML files to improve generated LRF.''')) - prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml', - help=_('''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')) - prepro.add_option('--book-designer', action='store_true', default=False, dest='book_designer', - help=_('''Use this option on html0 files from Book Designer.''')) - - fonts = parser.add_option_group('FONT FAMILIES', - _('''Specify trutype font families for serif, sans-serif and monospace fonts. ''' - '''These fonts will be embedded in the LRF file. Note that custom fonts lead to ''' - '''slower page turns. ''' - '''For example: ''' - '''--serif-family "Times New Roman" - ''')) - fonts.add_option('--serif-family', - default=None, dest='serif_family', type='string', - help=_('The serif family of fonts to embed')) - fonts.add_option('--sans-family', - default=None, dest='sans_family', type='string', - help=_('The sans-serif family of fonts to embed')) - fonts.add_option('--mono-family', - default=None, dest='mono_family', type='string', - help=_('The monospace family of fonts to embed')) - - debug = parser.add_option_group('DEBUG OPTIONS') - debug.add_option('--verbose', dest='verbose', action='store_true', default=False, - help=_('''Be verbose while processing''')) - debug.add_option('--lrs', action='store_true', dest='lrs', \ - help=_('Convert to LRS'), default=False) - parser.add_option('--minimize-memory-usage', action='store_true', default=False, - help=_('Minimize memory usage at the cost of longer processing times. Use this option if you are on a memory constrained machine.')) - parser.add_option('--encoding', default=None, - help=_('Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from windows computers is cp-1252. Another common choice is utf-8. 
The default is to try and guess the encoding.')) - - return parser + + name = 'prs500' def find_custom_fonts(options, logger): from calibre.utils.fontconfig import files_for_family @@ -238,16 +48,16 @@ def find_custom_fonts(options, logger): f = family(options.sans_family) fonts['sans'] = files_for_family(f) if not fonts['sans']: - logger.warn('Unable to find sans family %s'%f) + logger.warn('Unable to find sans family %s'%f) if options.mono_family: f = family(options.mono_family) fonts['mono'] = files_for_family(f) if not fonts['mono']: - logger.warn('Unable to find mono family %s'%f) + logger.warn('Unable to find mono family %s'%f) return fonts - - -def Book(options, logger, font_delta=0, header=None, + + +def Book(options, logger, font_delta=0, header=None, profile=PRS500_PROFILE, **settings): ps = {} ps['topmargin'] = options.top_margin @@ -258,7 +68,7 @@ def Book(options, logger, font_delta=0, header=None, - profile.fudge if header: hdr = Header() - hb = TextBlock(textStyle=TextStyle(align='foot', + hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=int(profile.header_font_size*10)), blockStyle=BlockStyle(blockwidth=ps['textwidth'])) hb.append(header) @@ -269,20 +79,20 @@ def Book(options, logger, font_delta=0, header=None, ps['topmargin'] = 0 ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \ - ps['headheight'] - ps['headsep'] - profile.fudge - + fontsize = int(10*profile.font_size+font_delta*20) baselineskip = fontsize + 20 fonts = find_custom_fonts(options, logger) - tsd = dict(fontsize=fontsize, - parindent=int(10*profile.parindent), + tsd = dict(fontsize=fontsize, + parindent=int(10*profile.parindent), linespace=int(10*profile.line_space), baselineskip=baselineskip, wordspace=10*options.wordspace) if fonts['serif'] and fonts['serif'].has_key('normal'): tsd['fontfacename'] = fonts['serif']['normal'][1] - - book = _Book(textstyledefault=tsd, - pagestyledefault=ps, + + book = _Book(textstyledefault=tsd, + pagestyledefault=ps, blockstyledefault=dict(blockwidth=ps['textwidth']), bookid=uuid4().hex, **settings) @@ -291,7 +101,7 @@ def Book(options, logger, font_delta=0, header=None, for font in fonts[family].values(): book.embed_font(*font) FONT_FILE_MAP[font[1]] = font[0] - + for family in ['serif', 'sans', 'mono']: if not fonts[family]: fonts[family] = { 'normal' : (None, profile.default_fonts[family]) } @@ -299,4 +109,3 @@ def Book(options, logger, font_delta=0, header=None, raise ConversionError, 'Could not find the normal version of the ' + family + ' font' return book, fonts -from calibre import entity_to_unicode diff --git a/src/calibre/ebooks/lrf/any/__init__.py b/src/calibre/ebooks/lrf/any/__init__.py deleted file mode 100644 index f832dbb7fc..0000000000 --- a/src/calibre/ebooks/lrf/any/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' diff --git a/src/calibre/ebooks/lrf/any/convert_from.py b/src/calibre/ebooks/lrf/any/convert_from.py deleted file mode 100644 index fdfe1c54d5..0000000000 --- a/src/calibre/ebooks/lrf/any/convert_from.py +++ /dev/null @@ -1,199 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -'''Convert any ebook file into a LRF file.''' - -import sys, os, logging, shutil, tempfile, re - -from calibre.ebooks import UnknownFormatError -from calibre.ebooks.lrf import option_parser as _option_parser -from calibre import __appname__, setup_cli_handlers, extract -from calibre.ptempfile import TemporaryDirectory 
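The eight imports that follow are the heart of what this commit deletes: one hand-maintained converter per input format, dispatched by file extension further down in process_file(). The plugin pipeline registered in builtins.py at the top of this diff replaces that fan-out with a registry lookup. Below is a minimal sketch of the shape such a plugin takes, assuming the OutputFormatPlugin/OptionRecommendation API that the plumber hunk above consumes; the class name, the single option, and the convert() body are illustrative, not the actual calibre.ebooks.lrf.output.LRFOutput source.

from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation

class LRFOutputSketch(OutputFormatPlugin):
    # Hypothetical stand-in for calibre.ebooks.lrf.output.LRFOutput
    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'

    # Options are declared, not parsed by hand as in the deleted
    # option_parser() above; the plumber merges the recommended values
    # with whatever the user passed on the command line or in the GUI.
    options = set([
        OptionRecommendation(name='wordspace', recommended_value=2.5,
            help='Set the space between words in pts.'),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        # oeb_book is the normalized OEB tree produced by whichever
        # input plugin handled the source file; opts carries the merged
        # option values, so no per-format dispatch is needed here.
        raise NotImplementedError('illustrative sketch only')
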
-from calibre.ebooks.lrf.lit.convert_from import process_file as lit2lrf -from calibre.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf -from calibre.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf -from calibre.ebooks.lrf.txt.convert_from import process_file as txt2lrf -from calibre.ebooks.lrf.html.convert_from import process_file as html2lrf -from calibre.ebooks.lrf.epub.convert_from import process_file as epub2lrf -from calibre.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf -from calibre.ebooks.lrf.fb2.convert_from import process_file as fb22lrf - -from calibre.customize.ui import run_plugins_on_postprocess, run_plugins_on_preprocess - -def largest_file(files): - maxsize, file = 0, None - for f in files: - size = os.stat(f).st_size - if size > maxsize: - maxsize = size - file = f - return file - -def find_htmlfile(dir): - ext_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE) - toc_pat = re.compile(r'toc', re.IGNORECASE) - index_pat = re.compile(r'index', re.IGNORECASE) - toc_files, index_files, files = [], [], [] - - for root, dirs, _files in os.walk(dir): - for f in _files: - f = os.path.abspath(os.path.join(root, f)) - ext = os.path.splitext(f)[1] - if ext and ext_pat.match(ext): - toc_files.append(f) if toc_pat.search(f) else \ - index_files.append(f) if index_pat.search(f) else \ - files.append(f) - a = toc_files if toc_files else index_files if index_files else files - if a: - return largest_file(a) - -def number_of_unhidden_files(base, listing): - ans = 0 - for i in listing: - i = os.path.join(base, i) - if os.path.isdir(i) or os.path.basename(i).startswith('.'): - continue - ans += 1 - return ans - -def unhidden_directories(base, listing): - ans = [] - for i in listing: - if os.path.isdir(os.path.join(base, i)) and not i.startswith('__') and \ - not i.startswith('.'): - ans.append(i) - return ans - -def traverse_subdirs(tdir): - temp = os.listdir(tdir) - if number_of_unhidden_files(tdir, temp) == 0: - try: - cdir = os.path.join(tdir, unhidden_directories(tdir, temp)[0]) - return traverse_subdirs(cdir) - except IndexError: - pass - return tdir - -def handle_archive(path): - tdir = tempfile.mkdtemp(prefix=__appname__+'_'+'archive_') - extract(path, tdir) - files = [] - cdir = traverse_subdirs(tdir) - file = None - exts = ['lit', 'rtf', 'fb2','pdf', 'txt', 'epub', 'mobi', 'prc'] - candidates = map(lambda x:os.path.join(cdir, x), os.listdir(cdir)) - for ext in exts: - for f in candidates: - if f.lower().endswith('.'+ext): - files.append(f) - file = largest_file(files) - if not file: - file = find_htmlfile(cdir) - if isinstance(file, str): - file = file.decode(sys.getfilesystemencoding()) - return tdir, file - -def odt2lrf(path, options, logger): - from calibre.ebooks.odt.to_oeb import Extract - from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file - - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('odt2lrf') - setup_cli_handlers(logger, level) - - with TemporaryDirectory('_odt2lrf') as tdir: - opf = Extract()(path, tdir) - options.use_spine = True - options.encoding = 'utf-8' - html_process_file(opf.replace('metadata.opf', 'index.html'), options, logger) - -def process_file(path, options, logger=None): - path = os.path.abspath(os.path.expanduser(path)) - path = run_plugins_on_preprocess(path) - tdir = None - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('any2lrf') - 
setup_cli_handlers(logger, level) - if not os.access(path, os.R_OK): - logger.critical('Cannot read from %s', path) - return 1 - ext = os.path.splitext(path)[1] - if not ext or ext == '.': - logger.critical('Unknown file type: %s', path) - return 1 - ext = ext[1:].lower() - cwd = os.getcwd() - if not options.output: - fmt = '.lrs' if options.lrs else '.lrf' - options.output = os.path.splitext(os.path.basename(path))[0] + fmt - options.output = os.path.abspath(os.path.expanduser(options.output)) - if ext in ['zip', 'rar', 'oebzip']: - newpath = None - try: - tdir, newpath = handle_archive(path) - except: - logger.exception(' ') - if not newpath: - raise UnknownFormatError('Could not find ebook in archive') - path = newpath - logger.info('Found ebook in archive: %s', repr(path)) - try: - ext = os.path.splitext(path)[1][1:].lower() - convertor = None - if 'htm' in ext: - convertor = html2lrf - elif 'lit' == ext: - convertor = lit2lrf - elif 'pdf' == ext: - convertor = pdf2lrf - elif 'rtf' == ext: - convertor = rtf2lrf - elif 'txt' == ext: - convertor = txt2lrf - elif 'epub' == ext: - convertor = epub2lrf - elif ext in ['mobi', 'prc', 'azw']: - convertor = mobi2lrf - elif ext == 'fb2': - convertor = fb22lrf - elif ext == 'odt': - convertor = odt2lrf - if not convertor: - raise UnknownFormatError(_('Converting from %s to LRF is not supported.')%ext) - convertor(path, options, logger) - - finally: - os.chdir(cwd) - if tdir and os.path.exists(tdir): - shutil.rmtree(tdir) - return 0 - - -def option_parser(gui_mode=False): - return _option_parser(usage=_('''\ -any2lrf [options] myfile - -Convert any ebook format into LRF. Supported formats are: -LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or -ZIP archive, looking for an ebook inside the archive. 
- '''), gui_mode=gui_mode) - - -def main(args=sys.argv, logger=None, gui_mode=False): - parser = option_parser(gui_mode) - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print _('No file to convert specified.') - return 1 - - src = args[1] - if not isinstance(src, unicode): - src = src.decode(sys.getfilesystemencoding()) - return process_file(src, options, logger) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/epub/__init__.py b/src/calibre/ebooks/lrf/epub/__init__.py deleted file mode 100644 index ab32bc9c41..0000000000 --- a/src/calibre/ebooks/lrf/epub/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - diff --git a/src/calibre/ebooks/lrf/epub/convert_from.py b/src/calibre/ebooks/lrf/epub/convert_from.py deleted file mode 100644 index c564930ea5..0000000000 --- a/src/calibre/ebooks/lrf/epub/convert_from.py +++ /dev/null @@ -1,75 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - -import os, sys, shutil, logging -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks import ConversionError, DRMError -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks.metadata.opf import OPF -from calibre.ebooks.metadata.epub import OCFDirReader -from calibre.utils.zipfile import ZipFile -from calibre import setup_cli_handlers -from calibre.ptempfile import PersistentTemporaryDirectory - - -def option_parser(): - return lrf_option_parser( -_('''Usage: %prog [options] mybook.epub - - -%prog converts mybook.epub to mybook.lrf''') - ) - -def generate_html(pathtoepub, logger): - if not os.access(pathtoepub, os.R_OK): - raise ConversionError('Cannot read from ' + pathtoepub) - tdir = PersistentTemporaryDirectory('_epub2lrf') - #os.rmdir(tdir) - try: - ZipFile(pathtoepub).extractall(tdir) - except: - raise ConversionError, '.epub extraction failed' - if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')): - raise DRMError(os.path.basename(pathtoepub)) - - return tdir - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('epub2lrf') - setup_cli_handlers(logger, level) - epub = os.path.abspath(os.path.expanduser(path)) - tdir = generate_html(epub, logger) - try: - ocf = OCFDirReader(tdir) - htmlfile = ocf.opf.spine[0].path - options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE]) - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - options.use_spine = True - - html_process_file(htmlfile, options, logger=logger) - finally: - try: - shutil.rmtree(tdir) - except: - logger.warning('Failed to delete temporary directory '+tdir) - - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No epub file specified' - return 1 - process_file(args[1], options, logger) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/feeds/__init__.py b/src/calibre/ebooks/lrf/feeds/__init__.py deleted file mode 100644 index ec763fbda7..0000000000 --- a/src/calibre/ebooks/lrf/feeds/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python - 
-__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' diff --git a/src/calibre/ebooks/lrf/feeds/convert_from.py b/src/calibre/ebooks/lrf/feeds/convert_from.py deleted file mode 100644 index 6965ea7bf3..0000000000 --- a/src/calibre/ebooks/lrf/feeds/convert_from.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import with_statement -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -Convert web feeds to LRF files. -''' -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file -from calibre.web.feeds.main import option_parser as feeds_option_parser -from calibre.web.feeds.main import run_recipe -from calibre.ptempfile import TemporaryDirectory -from calibre import sanitize_file_name, strftime - -import sys, os - -def option_parser(): - parser = feeds_option_parser() - parser.remove_option('--output-dir') - parser.remove_option('--lrf') - parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk')) - lrf_parser = lrf_option_parser('') - lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf')) - parser.merge(lrf_parser) - return parser - -def main(args=sys.argv, notification=None, handler=None): - parser = option_parser() - opts, args = parser.parse_args(args) - opts.lrf = True - - if len(args) != 2 and opts.feeds is None: - parser.print_help() - return 1 - - recipe_arg = args[1] if len(args) > 1 else None - - with TemporaryDirectory('_feeds2lrf') as tdir: - opts.output_dir = tdir - - recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler) - - htmlfile = os.path.join(tdir, 'index.html') - if not os.access(htmlfile, os.R_OK): - raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg) - - lparser = lrf_option_parser('') - ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0] - parser.merge_options(ropts, opts) - - if not opts.output: - ext = '.lrs' if opts.lrs else '.lrf' - fname = recipe.title + strftime(recipe.timefmt)+ext - opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname)) - print 'Generating LRF...' - process_file(htmlfile, opts) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index ebfdecc6f4..515ec4182d 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -1,12 +1,12 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -""" +""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. 
""" -import os, re, sys, copy, glob, logging, tempfile +import os, re, sys, copy, glob, tempfile from collections import deque from urllib import unquote from urlparse import urlparse @@ -16,6 +16,7 @@ from calibre.customize.ui import run_plugins_on_postprocess try: from PIL import Image as PILImage + PILImage except ImportError: import Image as PILImage @@ -25,13 +26,12 @@ from calibre.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, EmpLine -from calibre.ebooks.lrf.pylrs.pylrs import Span -from calibre.ebooks.lrf import Book, entity_to_unicode -from calibre.ebooks.lrf import option_parser as lrf_option_parser +from calibre.ebooks.lrf.pylrs.pylrs import Span +from calibre.ebooks.lrf import Book from calibre.ebooks import ConversionError -from calibre.ebooks.lrf.html.table import Table -from calibre import filename_to_utf8, setup_cli_handlers, __appname__, \ - fit_image, preferred_encoding +from calibre.ebooks.lrf.html.table import Table +from calibre import filename_to_utf8, __appname__, \ + fit_image, preferred_encoding, entity_to_unicode from calibre.ptempfile import PersistentTemporaryFile from calibre.devices.interface import DevicePlugin as Device from calibre.ebooks.lrf.html.color_map import lrs_color @@ -43,7 +43,7 @@ def update_css(ncss, ocss): ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] - + def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] @@ -74,7 +74,7 @@ def strip_style_comments(match): return src def tag_regex(tagname): - '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' + '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \ close=r''%dict(t=tagname)) @@ -82,49 +82,49 @@ class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - - + + MARKUP_MASSAGE = [ # Close tags - (re.compile(r']*)?/>', re.IGNORECASE), + (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), - # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), - + # Remove self closing script tags as they also mess up BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), - + # BeautifulSoup treats self closing
<div/> tags as open <div> tags
-        (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), 
+        (re.compile(r'(?i)<\s*div([^>]*)/\s*>'),
         lambda match: '<div%s></div>'%match.group(1))
-        
+
     ]
     # Fix Baen markup
-    BAEN = [ 
-        (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), 
+    BAEN = [
+        (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE),
         lambda match: match.group(1)),
-        (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), + (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), lambda match: match.group(1)), - (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), + (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup @@ -135,14 +135,14 @@ class HTMLConverter(object): (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Remove
and replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), - (re.compile(r'(.*)', re.IGNORECASE), - lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 + (re.compile(r'(.*)', re.IGNORECASE), + lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), - + ] - + # Fix Book Designer markup BOOK_DESIGNER = [ # HR @@ -161,23 +161,23 @@ class HTMLConverter(object): (re.compile('<]*?>( ){4}

', re.IGNORECASE), lambda match : '

'), ] - + def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) - + def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) - + def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) - + CSS = { 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, @@ -201,27 +201,28 @@ class HTMLConverter(object): 'sup' : {'vertical-align': 'super', 'font-size': '60%'}, 'sub' : {'vertical-align': 'sub', 'font-size': '60%'}, } - + def __init__(self, book, fonts, options, logger, paths): ''' Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. - - @param book: The LRF book + + @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use ''' - # Defaults for various formatting tags + # Defaults for various formatting tags object.__setattr__(self, 'options', options) + self.log = logger self.fonts = fonts #: dict specifying font families to use - # Memory - self.scaled_images = {} #: Temporary files with scaled version of images - self.rotated_images = {} #: Temporary files with rotated version of images + # Memory + self.scaled_images = {} #: Temporary files with scaled version of images + self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = [] #: Keep track of already used textstyles self.block_styles = [] #: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: and id elements - self.links = deque() #: elements + self.links = deque() #: elements self.processed_files = [] self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.image_memory = [] @@ -235,30 +236,30 @@ class HTMLConverter(object): self.preserve_block_style = False #: Used so that
<p> tags in <blockquote>
elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() - - # Styles - self.blockquote_style = book.create_block_style(sidemargin=60, + + # Styles + self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) - - + + self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 - + self.book = book #: The Book object representing a BBeB book - + self.override_css = {} self.override_pcss = {} - + if self._override_css is not None: if os.access(self._override_css, os.R_OK): src = open(self._override_css, 'rb').read() else: src = self._override_css - match = self.PAGE_BREAK_PAT.search(src) + match = self.PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) @@ -266,12 +267,12 @@ class HTMLConverter(object): update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) - - - + + + paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, unicode) else path for path in paths] - + while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: @@ -287,62 +288,62 @@ class HTMLConverter(object): if link['path'] == path: self.links.remove(link) break - self.log_warn('Could not process '+path) + self.log.warn('Could not process '+path) if self.verbose: - self.log_exception(' ') + self.log.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] - + if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) - + for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) - + if self.base_font_size > 0: - self.log_info('\tRationalizing font sizes...') + self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) - + def is_baen(self, soup): - return bool(soup.find('meta', attrs={'name':'Publisher', + return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) - + def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) - + def preprocess(self, raw): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE) - + if not self.book_designer and self.is_book_designer(raw): self.book_designer = True - self.log_info(_('\tBook Designer file detected.')) - - self.log_info(_('\tParsing HTML...')) - + self.log.info(_('\tBook Designer file detected.')) + + self.log.info(_('\tParsing HTML...')) + if self.baen: nmassage.extend(HTMLConverter.BAEN) - + if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: nmassage.extend(HTMLConverter.BOOK_DESIGNER) try: - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) except ConversionError, err: if 'Failed to coerce to unicode' in str(err): raw = unicode(raw, 'utf8', 'replace') - soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) else: raise if not self.baen and self.is_baen(soup): self.baen = True - self.log_info(_('\tBaen file detected. Re-parsing...')) + self.log.info(_('\tBaen file detected. 
Re-parsing...')) return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') @@ -358,13 +359,13 @@ class HTMLConverter(object): try: dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') dump.write(unicode(soup).encode('utf-8')) - self.log_info(_('Written preprocessed HTML to ')+dump.name) + self.log.info(_('Written preprocessed HTML to ')+dump.name) dump.close() except: pass - + return soup - + def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() @@ -373,13 +374,13 @@ class HTMLConverter(object): self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] - + upath = path.encode(sys.getfilesystemencoding()) if isinstance(path, unicode) else path self.file_name = os.path.basename(upath.decode(sys.getfilesystemencoding())) - self.log_info(_('Processing %s'), repr(upath) if self.verbose else repr(self.file_name)) - + self.log.info(_('Processing %s')%( repr(upath) if self.verbose else repr(self.file_name))) + if not os.path.exists(upath): - upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names + upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names f = open(upath, 'rb') raw = f.read() if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files @@ -390,7 +391,7 @@ class HTMLConverter(object): raw = xml_to_unicode(raw, self.verbose)[0] f.close() soup = self.preprocess(raw) - self.log_info(_('\tConverting to BBeB...')) + self.log.info(_('\tConverting to BBeB...')) self.current_style = {} self.page_break_found = False if not isinstance(path, unicode): @@ -399,9 +400,9 @@ class HTMLConverter(object): self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) - - - + + + def parse_css(self, style): """ Parse the contents of a - - -%(body)s - - -''' - res = [] - para = [] - styles = [] - for page in self.pages: - res.append(u''%page.id) - for group in page.groups: - if group.is_header or group.is_footer: - continue - if group.style is not None: - styles.append(u'.%s %s\n'%(group.id, group.style.to_css())) - for line in group.lines: - if line.is_para_start: - indent = group.left_margin - line.left - if para: - res.append(u'

%s

'%(indent, ''.join(para))) - para = [] - para.append(line.to_xhtml(group.id)) - if page.page_break_after: - res.append(u'
') - if para: - res.append(u'

%s

'%(''.join(para))) - para = [] - - return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8') - -class PDFConverter(object): - - @classmethod - def generate_xml(cls, pathtopdf, logger): - pathtopdf = os.path.abspath(pathtopdf) - tdir = tempfile.mkdtemp('pdf2xml', __appname__) - atexit.register(shutil.rmtree, tdir) - xmlfile = os.path.basename(pathtopdf)+'.xml' - os.chdir(tdir) - cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile) - p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT, - stdout=subprocess.PIPE) - log = p.stdout.read() - ret = p.wait() - if ret != 0: - raise ConversionError, log - xmlfile = os.path.join(tdir, xmlfile) - if os.stat(xmlfile).st_size < 20: - raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.') - return xmlfile - - - def __init__(self, pathtopdf, logger, opts): - self.cwd = os.getcwdu() - self.logger = logger - self.opts = opts - try: - self.logger.info('Converting PDF to XML') - self.xmlfile = self.generate_xml(pathtopdf, self.logger) - self.tdir = os.path.dirname(self.xmlfile) - self.data_dir = self.xmlfile + '_data' - outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml' - self.logger.info('Parsing XML') - self.document = PDFDocument(self.xmlfile) - self.outline = parse(outline_file) - finally: - os.chdir(self.cwd) - - def convert(self, output_dir): - doc = self.document.to_xhtml() - open(os.path.join(output_dir, 'document.html'), 'wb').write(doc) - - - -def option_parser(): - parser = OptionParser(usage=\ -''' -%prog [options] myfile.pdf - -Convert a PDF file to a HTML file. -''') - parser.add_option('-o', '--output-dir', default='.', - help=_('Path to output directory in which to create the HTML file. Defaults to current directory.')) - parser.add_option('--verbose', default=False, action='store_true', - help=_('Be more verbose.')) - return parser - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args() - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('pdf2html') - setup_cli_handlers(logger, level) - if len(args) != 1: - parser.print_help() - print _('You must specify a single PDF file.') - return 1 - options.output_dir = os.path.abspath(options.output_dir) - converter = PDFConverter(os.path.abspath(args[0]), logger, options) - converter.convert(options.output_dir) - - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/txt/__init__.py b/src/calibre/ebooks/lrf/txt/__init__.py deleted file mode 100644 index c705e32a66..0000000000 --- a/src/calibre/ebooks/lrf/txt/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/txt/convert_from.py b/src/calibre/ebooks/lrf/txt/convert_from.py deleted file mode 100644 index 89441f9d6d..0000000000 --- a/src/calibre/ebooks/lrf/txt/convert_from.py +++ /dev/null @@ -1,112 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -""" -Convert .txt files to .lrf -""" -import os, sys, codecs, logging, re, shutil - -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks import ConversionError -from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file -from calibre.ebooks.markdown import markdown -from calibre import 
setup_cli_handlers -from calibre.ebooks.metadata import MetaInformation -from calibre.ebooks.metadata.opf import OPFCreator - -def option_parser(): - parser = lrf_option_parser( -_('''%prog [options] mybook.txt - - -%prog converts mybook.txt to mybook.lrf''')) - parser.add_option('--debug-html-generation', action='store_true', default=False, - dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.')) - return parser - -def fix_image_includes(sdir, tdir, match): - path = match.group(1).split('/') - src = os.path.join(sdir, *path) - dest = os.path.join(tdir, *path) - p = os.path.dirname(dest) - if not os.path.exists(p): - os.makedirs(p) - if not os.path.exists(dest): - shutil.copyfile(src, dest) - - -def generate_html(txtfile, encoding, tdir): - ''' - Convert txtfile to html and return a PersistentTemporaryFile object pointing - to the file with the HTML. - ''' - txtfile = os.path.abspath(txtfile) - enc = encoding - if not encoding: - encodings = ['cp1252', 'latin-1', 'utf8', 'iso-8859-1', 'koi8_r', 'koi8_u'] - txt, enc = None, None - for encoding in encodings: - try: - txt = codecs.open(txtfile, 'rb', encoding).read() - except UnicodeDecodeError: - continue - enc = encoding - break - if txt == None: - raise ConversionError, 'Could not detect encoding of %s'%(txtfile,) - else: - txt = codecs.open(txtfile, 'rb', enc).read() - - print 'Converting text to HTML...' - md = markdown.Markdown( - extensions=['footnotes', 'tables', 'toc'], - safe_mode=False, - ) - html = ''+md.convert(txt)+'' - for match in re.finditer(r']*src="([^"]+)"', html): - fix_image_includes(os.path.dirname(txtfile), tdir, match) - p = os.path.join(tdir, 'index.html') - open(p, 'wb').write(html.encode('utf-8')) - mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')]) - opf = OPFCreator(tdir, mi) - opf.create_manifest([(os.path.join(tdir, 'index.html'), None)]) - opf.create_spine([os.path.join(tdir, 'index.html')]) - opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb')) - return p - -def process_file(path, options, logger=None): - if logger is None: - level = logging.DEBUG if options.verbose else logging.INFO - logger = logging.getLogger('txt2lrf') - setup_cli_handlers(logger, level) - txt = os.path.abspath(os.path.expanduser(path)) - if not hasattr(options, 'debug_html_generation'): - options.debug_html_generation = False - tdir = PersistentTemporaryDirectory('_txt2lrf') - htmlfile = generate_html(txt, options.encoding, tdir) - options.encoding = 'utf-8' - if not options.debug_html_generation: - options.force_page_break = 'h2' - if not options.output: - ext = '.lrs' if options.lrs else '.lrf' - options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext) - options.output = os.path.abspath(os.path.expanduser(options.output)) - if not options.title: - options.title = os.path.splitext(os.path.basename(path))[0] - html_process_file(htmlfile, options, logger) - else: - print open(htmlfile, 'rb').read() - -def main(args=sys.argv, logger=None): - parser = option_parser() - options, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - print - print 'No txt file specified' - return 1 - process_file(args[1], options, logger) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lrf/txt/demo/demo.txt b/src/calibre/ebooks/lrf/txt/demo/demo.txt deleted file mode 100644 index af4139241b..0000000000 --- a/src/calibre/ebooks/lrf/txt/demo/demo.txt +++ /dev/null @@ -1,89 +0,0 @@ -Demonstration of 
`txt2lrf`
-==========================
-
-`txt2lrf` provides a convenient way to create LRF files with good formatting.
-`txt2lrf` recognizes a simple markup language called *markdown*.
-
-The idea is to provide a lightweight markup that can be used to create
-TXT files that can be read by themselves or automatically converted to LRF.
-[{@name=toc}]()
-
- -///Table of Contents/// - - -Text formatting ---------------- -**Bold** and *italic* text is easily specified. - -> Blockquotes are also very simple to specify. -> This is a basic blockquote paragraph. I absolutely -> love block quotes don't you? - - This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font. - - -For details on the text formatting syntax visit - - http://daringfireball.net/projects/markdown/syntax -___ -[Table of Contents](#toc) - -Lists ------ -Both ordered and unordered lists are supported. - - -### Unordered lists - -+ What a -+ *nice* -+ list - - - -### Ordered lists - -1. One -2. Two -3. Three - -**Note:** Nested lists are not supported - -___ -[Table of Contents](#toc) - -Tables ------- - -Simple tables are easily generated - -| |* Col 1 *|* Col 2 *| -|* Row 1 *| (1, 1) | (1, 2) | -|* Row 2 *| (2, 1) | (2, 2) | - -**Note:** Nested tables are not supported - -___ -[Table of Contents](#toc) - -Images ------- - -`txt2lrf` also has support for inline images like -![this one](small.jpg) this one. - -___ -[Table of Contents](#toc) - -Automatic TOC Creation ----------------------- - -By inserting `///Table of Contents///` into the text at some point -a table of contents is automatically generated with links that point -to all headings underlined with `-------`. - -___ -[Table of Contents](#toc) - diff --git a/src/calibre/ebooks/lrf/txt/demo/small.jpg b/src/calibre/ebooks/lrf/txt/demo/small.jpg deleted file mode 100644 index 6dae5fde42..0000000000 Binary files a/src/calibre/ebooks/lrf/txt/demo/small.jpg and /dev/null differ diff --git a/src/calibre/ebooks/lrf/web/__init__.py b/src/calibre/ebooks/lrf/web/__init__.py deleted file mode 100644 index c25b6259a8..0000000000 --- a/src/calibre/ebooks/lrf/web/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - - -builtin_profiles = [] -available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] diff --git a/src/calibre/ebooks/lrf/web/convert_from.py b/src/calibre/ebooks/lrf/web/convert_from.py deleted file mode 100644 index ca523e869b..0000000000 --- a/src/calibre/ebooks/lrf/web/convert_from.py +++ /dev/null @@ -1,183 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -'''Convert websites into LRF files.''' - -import sys, tempfile, shutil, os, logging, imp, inspect, re -from urlparse import urlsplit - -from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime -from calibre.ebooks.lrf import option_parser as lrf_option_parser -from calibre.ebooks.lrf.html.convert_from import process_file - -from calibre.web.fetch.simple import create_fetcher - -from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class -from calibre.ebooks.lrf.web import builtin_profiles, available_profiles - - -def option_parser(): - parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n''' - '''%prog downloads a site from the web and converts it ''' - '''into a LRF file for use with the SONY Reader. ''' - '''website_profile is one of '''+str(available_profiles)+\ - ''' If you specify a website_profile of default or do not specify ''' - '''it, you must specify the --url option.''' - ) - - parser.add_option('-u', '--url', dest='url', default=None, - help='The URL to download. 
                 You only need to specify this if you are not specifying a website_profile.')
-    parser.add_option('--user-profile', default=None,
-                      help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__)
-    parser.add_option('--username', dest='username', default=None,
-                      help='Specify the username to be used while downloading. Only used if the profile supports it.')
-    parser.add_option('--password', dest='password', default=None,
-                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
-    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
-                      default=None, type='int', dest='timeout')
-    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.timeout,
-                      default=None, type='int', dest='max_recursions')
-    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
-                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.timeout)
-    parser.add_option('--delay', default=None, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.timeout)
-    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
-                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
-    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
-                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
-    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
-                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
-    parser.add_option('--keep-downloaded-files', default=False, action='store_true',
-                      help='''Do not delete the downloaded files after creating the LRF''')
-    return parser
-
-def fetch_website(options, logger):
-    tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
-    options.dir = tdir
-    fetcher = create_fetcher(options, logger)
-    fetcher.preprocess_regexps = options.preprocess_regexps
-    return fetcher.start_fetch(options.url), tdir
-
-def create_lrf(htmlfile, options, logger):
-    if not options.author or options.author.lower() == 'unknown':
-        options.author = __appname__
-    options.header = True
-    if options.output:
-        options.output = os.path.abspath(os.path.expanduser(options.output))
-    else:
-        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
-
-    process_file(htmlfile, options, logger)
-
-def process_profile(args, options, logger=None):
-    tdir = None
-    try:
-        if logger is None:
-            level = logging.DEBUG if options.verbose else logging.INFO
-            logger = logging.getLogger('web2lrf')
-            setup_cli_handlers(logger, level)
-        index = -1
-
-        if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]):
-            profile = create_class(args[1])
-        else:
-            if options.user_profile is not None:
-                path = os.path.abspath(options.user_profile)
-                name = os.path.splitext(os.path.basename(path))[0]
-                res = imp.find_module(name, [os.path.dirname(path)])
-                module = imp.load_module(name, *res)
-                classes = inspect.getmembers(module,
-                    lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
-                            and x is not DefaultProfile and x is not FullContentProfile)
-                if not classes:
-                    raise CommandLineError('Invalid user profile '+path)
-                builtin_profiles.append(classes[0][1])
-                available_profiles.append(name)
-                if len(args) < 2:
-                    args.append(name)
-                args[1] = name
-            index = -1
-            if len(args) == 2:
-                try:
-                    if isinstance(args[1], basestring):
-                        if args[1] != 'default':
-                            index = available_profiles.index(args[1])
-                except ValueError:
-                    raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
-            else:
-                raise CommandLineError('Only one profile at a time is allowed.')
-            profile = DefaultProfile if index == -1 else builtin_profiles[index]
-
-        profile = profile(logger, options.verbose, options.username, options.password)
-        if profile.browser is not None:
-            options.browser = profile.browser
-
-        for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
-            val = getattr(options, opt)
-            if val is None:
-                setattr(options, opt, getattr(profile, opt))
-
-        if not options.url:
-            options.url = profile.url
-
-        if not options.url:
-            raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
-
-        if not options.title:
-            title = profile.title
-            if not title:
-                title = urlsplit(options.url).netloc
-            options.title = title + strftime(profile.timefmt)
-
-        options.match_regexps += profile.match_regexps
-        options.preprocess_regexps = profile.preprocess_regexps
-        options.filter_regexps += profile.filter_regexps
-
-        options.encoding = profile.encoding if options.encoding is None else options.encoding
-
-        if len(args) == 2 and args[1] != 'default':
-            options.anchor_ids = False
-
-        htmlfile, tdir = fetch_website(options, logger)
-        options.encoding = 'utf-8'
-        cwd = os.getcwd()
-        if not options.output:
-            title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title
-            options.output = os.path.join(cwd, options.title+('.lrs' if options.lrs else '.lrf'))
-        if not os.path.isabs(options.output):
-            options.output = os.path.join(cwd, options.output)
-
-        option_parser().parse_args(profile.html2lrf_options, options)
-
-        try:
-            os.chdir(os.path.dirname(htmlfile))
-            create_lrf(os.path.basename(htmlfile), options, logger)
-        finally:
-            os.chdir(cwd)
-    finally:
-        try:
-            profile.cleanup()
-        except:
-            pass
-        if tdir and os.path.isdir(tdir):
-            if options.keep_downloaded_files:
-                print 'Downloaded files in ', tdir
-            else:
-                shutil.rmtree(tdir)
-
-
-def main(args=sys.argv, logger=None):
-    parser = option_parser()
-    options, args = parser.parse_args(args)
-    if len(args) > 2 or (len(args) == 1 and not options.user_profile):
-        parser.print_help()
-        return 1
-    try:
-        process_profile(args, options, logger=logger)
-    except CommandLineError, err:
-        print >>sys.stderr, err
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
diff --git a/src/calibre/ebooks/lrf/web/profiles/__init__.py b/src/calibre/ebooks/lrf/web/profiles/__init__.py
deleted file mode 100644
index 9544cad7c3..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/__init__.py
+++ /dev/null
@@ -1,572 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-Contains the Base Profiles that can be used to easily create profiles to download
-particular websites.
-'''
-
-import tempfile, time, calendar, re, operator, atexit, shutil, os
-from htmlentitydefs import name2codepoint
-from email.utils import formatdate
-
-from calibre import __appname__, iswindows, browser, strftime
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
-
-
-class DefaultProfile(object):
-
-    #: The title to use for the LRF file
-    #: @type: string
-    title = 'Default Profile'
-
-    #: Maximum number of articles to download from each feed
-    #: @type: integer
-    max_articles_per_feed = 10
-
-    #: If True process the <description> element of the feed as HTML
-    #: @type: boolean
-    html_description = True
-
-    #: How many days old should the oldest article downloaded from the feeds be
-    #: @type: integer
-    oldest_article = 7
-
-    #: Recommend frequency at which to download this profile. In days.
-    recommended_frequency = 7
-
-    #: Number of levels of links to follow
-    #: @type: integer
-    max_recursions = 1
-
-    #: Maximum number of files to download
-    #: @type: integer
-    max_files = 3000
-
-    #: Delay between consecutive downloads in seconds
-    #: @type: integer
-    delay = 0
-
-    #: Timeout for fetching files from server in seconds
-    #: @type: integer
-    timeout = 10
-
-    #: The format string for the date shown on the first page
-    #: @type: string
-    timefmt = ' [%a %d %b %Y]'
-
-    #: The order of elements to search for a URL when parsing the RSS feed. You
-    #: can replace these elements by completely arbitrary elements to customize
-    #: feed processing.
-    #: @type: list of strings
-    url_search_order = ['guid', 'link']
-
-    #: The format string used to parse the publication date in the RSS feed.
-    #: If set to None some default heuristics are used, these may fail,
-    #: in which case set this to the correct string or re-implement
-    #: L{DefaultProfile.strptime} in your subclass.
-    #: @type: string or None
-    pubdate_fmt = None
-
-    #: If True will look for a publication date for each article.
-    #: If False assumes the publication date is the current time.
-    #: @type: boolean
-    use_pubdate = True,
-
-    #: Max number of characters in the short description.
-    #: Used by L{FullContentProfile}
-    #: @type: integer
-    summary_length = 500
-
-    #: If True stylesheets are not downloaded and processed
-    #: Convenient flag to disable loading of stylesheets for websites
-    #: that have overly complex stylesheets unsuitable for conversion
-    #: to ebooks formats
-    #: @type: boolean
-    no_stylesheets = False
-
-    #: If False articles with the same title in the same feed
-    #: are not downloaded multiple times
-    #: @type: boolean
-    allow_duplicates = False
-
-    #: If True the GUI will ask the user for a username and password
-    #: to use while downloading
-    #: @type: boolean
-    needs_subscription = False
-
-    #: Specify an override encoding for sites that have an incorrect
-    #: charset specification. THe most common being specifying latin1 and
-    #: using cp1252
-    encoding = None
-
-    #: List of regular expressions that determines which links to follow
-    #: If empty, it is ignored.
-    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
-    #: @type: list of strings
-    match_regexps = []
-
-    #: List of regular expressions that determines which links to ignore
-    #: If empty it is ignored
-    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
-    #: @type: list of strings
-    filter_regexps = []
-
-    #: List of options to pass to html2lrf, to customize conversion
-    #: to LRF
-    #: @type: list of strings
-    html2lrf_options = []
-
-    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
-    #: list should be a two element tuple. The first element of the tuple should
-    #: be a compiled regular expression and the second a callable that takes
-    #: a single match object and returns a string to replace the match.
-    #: @type: list of tuples
-    preprocess_regexps = []
-
-    # See the built-in profiles for examples of these settings.
-
-    #: The URL of the website
-    #: @type: string
-    url = ''
-
-    feeds = []
-    CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
-
-    def get_feeds(self):
-        '''
-        Return a list of RSS feeds to fetch for this profile. Each element of the list
-        must be a 2-element tuple of the form (title, url).
-        '''
-        if not self.feeds:
-            raise NotImplementedError
-        return self.feeds
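[Context, not part of the diff: a site was added by subclassing the base class above and overriding a few attributes plus get_feeds(). A minimal sketch; the class name and feed URL are invented for illustration.]

    from calibre.ebooks.lrf.web.profiles import DefaultProfile

    class ExampleNews(DefaultProfile):
        # Hypothetical profile: everything not overridden keeps the
        # DefaultProfile values documented above.
        title = 'Example News'        # becomes the LRF title
        max_articles_per_feed = 20    # override the default of 10
        oldest_article = 2            # skip items older than two days

        def get_feeds(self):
            # Must return a list of 2-element (title, url) tuples.
            return [('Front Page', 'http://news.example.com/rss.xml')]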
-
-    @classmethod
-    def print_version(cls, url):
-        '''
-        Take a URL pointing to an article and returns the URL pointing to the
-        print version of the article.
-        '''
-        return url
-
-    @classmethod
-    def get_browser(cls):
-        '''
-        Return a browser instance used to fetch documents from the web.
-
-        If your profile requires that you login first, override this method
-        in your subclass. See for example the nytimes profile.
-        '''
-        return browser()
-
-
-    def __init__(self, logger, verbose=False, username=None, password=None, lrf=True):
-        self.logger = logger
-        self.username = username
-        self.password = password
-        self.verbose = verbose
-        self.lrf = lrf
-        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
-        self.browser = self.get_browser()
-        try:
-            self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
-        except NotImplementedError:
-            self.url = None
-        atexit.register(cleanup, self.temp_dir)
-
-    def build_index(self):
-        '''Build an RSS based index.html'''
-        articles = self.parse_feeds()
-        encoding = 'utf-8' if self.encoding is None else self.encoding
-
-        def build_sub_index(title, items):
-            ilist = ''
-            li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
-                u'<div style="font-size:small; font-family:sans">%(description)s</div></li>\n'
-            for item in items:
-                if not item.has_key('date'):
-                    item['date'] = time.strftime('%a, %d %b', time.localtime())
-                ilist += li%item
-            return u'''\
-            <html>
-            <body>
-            <h2>%(title)s</h2>
-            <ul>
-            %(items)s
-            </ul>
-            </body>
-            </html>
-            '''%dict(title=title, items=ilist.rstrip())
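[Aside, not part of the diff: the li%item substitution above is ordinary %-formatting against each article dictionary. The exact markup of the deleted template is only partly recoverable, so the string below is representative.]

    # %-formatting a representative item template against an article dict.
    li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]</li>\n'
    item = {'url': 'http://example.com/a.html', 'title': 'A headline',
            'date': 'Mon, 05 Jan'}
    print(li % item)
    # <li><a href="http://example.com/a.html">A headline</a> [Mon, 05 Jan]</li>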
-        cnum = 0
-        clist = ''
-        categories = articles.keys()
-        categories.sort()
-        for category in categories:
-            cnum += 1
-            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
-            prefix = 'file:' if iswindows else ''
-            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-            src = build_sub_index(category, articles[category])
-            open(cfile, 'wb').write(src.encode(encoding))
-
-        title = self.title
-        if not isinstance(title, unicode):
-            title = unicode(title, 'utf-8', 'replace')
-        src = u'''\
-        <html>
-        <body>
-        <h1>%(title)s</h1>
-        <div style='text-align: right; font-weight: bold'>%(date)s</div>
-        <ul>
-        %(categories)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(date=strftime('%a, %d %B, %Y'),
-                 categories=clist, title=title)
-        index = os.path.join(self.temp_dir, 'index.html')
-        open(index, 'wb').write(src.encode(encoding))
-
-        return index
-
-
-    @classmethod
-    def tag_to_string(cls, tag, use_alt=True):
-        '''
-        Convenience method to take a BeautifulSoup Tag and extract the text from it
-        recursively, including any CDATA sections and alt tag attributes.
-        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
-        @type use_alt: boolean
-        @return: A unicode (possibly empty) object
-        @rtype: unicode string
-        '''
-        if not tag:
-            return ''
-        if isinstance(tag, basestring):
-            return tag
-        strings = []
-        for item in tag.contents:
-            if isinstance(item, (NavigableString, CData)):
-                strings.append(item.string)
-            elif isinstance(item, Tag):
-                res = cls.tag_to_string(item)
-                if res:
-                    strings.append(res)
-                elif use_alt and item.has_key('alt'):
-                    strings.append(item['alt'])
-        return u''.join(strings)
-
-    def get_article_url(self, item):
-        '''
-        Return the article URL given an item Tag from a feed, or None if no valid URL is found
-        @type item: BeatifulSoup.Tag
-        @param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
-        @rtype: string or None
-        '''
-        url = None
-        for element in self.url_search_order:
-            url = item.find(element.lower())
-            if url:
-                break
-        return url
-
-
-    def parse_feeds(self, require_url=True):
-        '''
-        Create list of articles from a list of feeds.
-        @param require_url: If True skip articles that don't have a link to a HTML page with the full article contents.
-        @type require_url: boolean
-        @rtype: dictionary
-        @return: A dictionary whose keys are feed titles and whose values are each
-        a list of dictionaries. Each list contains dictionaries of the form::
-            {
-            'title'       : article title,
-            'url'         : URL of print version,
-            'date'        : The publication date of the article as a string,
-            'description' : A summary of the article
-            'content'     : The full article (can be an empty string). This is used by FullContentProfile
-            }
-        '''
-        added_articles = {}
-        feeds = self.get_feeds()
-        articles = {}
-        for title, url in feeds:
-            try:
-                src = self.browser.open(url).read()
-            except Exception, err:
-                self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err))
-                if self.verbose:
-                    self.logger.exception(' ')
-                continue
-
-            articles[title] = []
-            added_articles[title] = []
-            soup = BeautifulStoneSoup(src)
-            for item in soup.findAll('item'):
-                try:
-                    atitle = item.find('title')
-                    if not atitle:
-                        continue
-
-                    atitle = self.tag_to_string(atitle)
-                    if self.use_pubdate:
-                        pubdate = item.find('pubdate')
-                        if not pubdate:
-                            pubdate = item.find('dc:date')
-                        if not pubdate or not pubdate.string:
-                            pubdate = formatdate()
-                        pubdate = self.tag_to_string(pubdate)
-                        pubdate = pubdate.replace('+0000', 'GMT')
-
-                    url = self.get_article_url(item)
-                    url = self.tag_to_string(url)
-                    if require_url and not url:
-                        self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
-                        continue
-                    purl = url
-                    try:
-                        purl = self.print_version(url)
-                    except Exception, err:
-                        self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
-                        continue
-
-                    content = item.find('content:encoded')
-                    if not content:
-                        content = item.find('description')
-                    if content:
-                        content = self.process_html_description(content, strip_links=False)
-                    else:
-                        content = ''
-
-                    d = {
-                        'title'    : atitle,
-                        'url'      : purl,
-                        'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
-                        'date'     : pubdate if self.use_pubdate else formatdate(),
-                        'content'  : content,
-                        }
-                    delta = time.time() - d['timestamp']
-                    if not self.allow_duplicates:
-                        if d['title'] in added_articles[title]:
-                            continue
-                        added_articles[title].append(d['title'])
-                    if delta > self.oldest_article*3600*24:
-                        continue
-
-                except Exception, err:
-                    if self.verbose:
-                        self.logger.exception('Error parsing article:\n%s'%(item,))
-                    continue
-                try:
-                    desc = ''
-                    for c in item.findAll('description'):
-                        desc = self.tag_to_string(c)
-                        if desc:
-                            break
-                    d['description'] = self.process_html_description(desc) if self.html_description else desc.string
-                except:
-                    d['description'] = ''
-                articles[title].append(d)
-            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-            articles[title] = articles[title][:self.max_articles_per_feed+1]
-            #for item in articles[title]:
-            #    item.pop('timestamp')
-            if not articles[title]:
-                articles.pop(title)
-        return articles
-
-
-    def cleanup(self):
-        '''
-        Called after LRF file has been generated. Use it to do any cleanup like
-        logging out of subscription sites, etc.
-        '''
-        pass
-
-    @classmethod
-    def process_html_description(cls, tag, strip_links=True):
-        '''
-        Process a <description> tag that contains HTML markup, either
-        entity encoded or escaped in a CDATA section.
-        @return: HTML
-        @rtype: string
-        '''
-        src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
-        match = cls.CDATA_PAT.match(src.lstrip())
-        if match:
-            src = match.group(1)
-        else:
-            replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-            for e in replaced_entities:
-                ent = '&'+e+';'
-                src = src.replace(ent, unichr(name2codepoint[e]))
-        if strip_links:
-            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
-
-        return src
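[Aside, not part of the diff: a self-contained sketch of what process_html_description() does with a CDATA-wrapped or entity-encoded description.]

    import re
    from htmlentitydefs import name2codepoint  # Python 2, as in this file

    CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def describe(src):
        m = CDATA_PAT.match(src.lstrip())
        if m:                              # markup was escaped in a CDATA section
            return m.group(1)
        for e in ('amp', 'lt', 'gt'):      # else decode a few common entities
            src = src.replace('&'+e+';', unichr(name2codepoint[e]))
        return src

    print(describe('<![CDATA[<b>Breaking</b> news]]>'))  # <b>Breaking</b> news
    print(describe('Tom &amp; Jerry'))                   # Tom & Jerry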
-
-    DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
-    FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tueday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6)
-    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
-    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
-                          July=7, August=8, September=9, October=10,
-                          November=11, December=12)
-
-    @classmethod
-    def strptime(cls, src):
-        '''
-        Take a string and return the date that string represents, in UTC as
-        an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
-        a bunch of heuristics and is a prime candidate for being overridden in a
-        subclass.
-        @param src: Timestamp as a string
-        @type src: string
-        @return: time ans a epoch
-        @rtype: number
-        '''
-        delta = 0
-        zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
-        if zone:
-            delta = zone.group(1)
-            hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
-            delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
-            src = src.replace(zone.group(), '')
-        if cls.pubdate_fmt is None:
-            src = src.strip().split()
-            try:
-                src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
-            except KeyError:
-                src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+','
-            try:
-                src[2] = str(cls.MONTH_MAP[src[2]])
-            except KeyError:
-                src[2] = str(cls.FULL_MONTH_MAP[src[2]])
-            fmt = '%w, %d %m %Y %H:%M:%S'
-            src = src[:5] # Discard extra information
-            try:
-                time_t = time.strptime(' '.join(src), fmt)
-            except ValueError:
-                time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
-            return calendar.timegm(time_t)-delta
-        else:
-            return calendar.timegm(time.strptime(src, cls.pubdate_fmt))
-
-    def command_line_options(self):
-        args = []
-        args.append('--max-recursions='+str(self.max_recursions))
-        args.append('--delay='+str(self.delay))
-        args.append('--max-files='+str(self.max_files))
-        for i in self.match_regexps:
-            args.append('--match-regexp="'+i+'"')
-        for i in self.filter_regexps:
-            args.append('--filter-regexp="'+i+'"')
-        return args
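[Aside, not part of the diff: a condensed, runnable illustration of the strptime() heuristic above — day and month names are mapped to numbers so a single format string can parse most RSS dates.]

    import time, calendar

    DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                     Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

    def parse_pubdate(src):
        parts = src.strip().split()
        parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','  # 'Mon,' -> '1,'
        parts[2] = str(MONTH_MAP[parts[2]])           # 'Jan'  -> '1'
        t = time.strptime(' '.join(parts[:5]), '%w, %d %m %Y %H:%M:%S')
        return calendar.timegm(t)                     # epoch seconds, UTC

    print(parse_pubdate('Mon, 05 Jan 2009 10:30:00 GMT'))  # 1231151400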
-
-
-class FullContentProfile(DefaultProfile):
-    '''
-    This profile is designed for feeds that embed the full article content in the RSS file.
-    '''
-
-    max_recursions = 0
-    article_counter = 0
-
-
-    def build_index(self):
-        '''Build an RSS based index.html. '''
-        articles = self.parse_feeds(require_url=False)
-
-        def build_sub_index(title, items):
-            ilist = ''
-            li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
-                u'<div style="font-size:small; font-family:sans">%(description)s</div></li>\n'
-            for item in items:
-                content = item['content']
-                if not content:
-                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
-                    continue
-                item['description'] = cutoff(item['description'], self.summary_length)+'…'
-                self.article_counter = self.article_counter + 1
-                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
-                item['url'] = url
-                open(url, 'wb').write((u'''\
-                <html>
-                <body>
-                <h2>%s</h2>
-                <div>
-                %s
-                </div>
-                </body>
-                </html>
-                '''%(item['title'], content)).encode('utf-8')
-                )
-                ilist += li%item
-            return u'''\
-            <html>
-            <body>
-            <h2>%(title)s</h2>
-            <ul>
-            %(items)s
-            </ul>
-            </body>
-            </html>
-            '''%dict(title=title, items=ilist.rstrip())
-
-        cnum = 0
-        clist = ''
-        categories = articles.keys()
-        categories.sort()
-        for category in categories:
-            cnum += 1
-            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
-            prefix = 'file:' if iswindows else ''
-            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-            src = build_sub_index(category, articles[category])
-            open(cfile, 'wb').write(src.encode('utf-8'))
-
-        src = '''\
-        <html>
-        <body>
-        <h1>%(title)s</h1>
-        <div style='text-align: right; font-weight: bold'>%(date)s</div>
-        <ul>
-        %(categories)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
-                 categories=clist, title=self.title)
-        index = os.path.join(self.temp_dir, 'index.html')
-        open(index, 'wb').write(src.encode('utf-8'))
-        return index
-
-def cutoff(src, pos, fuzz=50):
-    si = src.find(';', pos)
-    if si > 0 and si-pos > fuzz:
-        si = -1
-    gi = src.find('>', pos)
-    if gi > 0 and gi-pos > fuzz:
-        gi = -1
-    npos = max(si, gi)
-    if npos < 0:
-        npos = pos
-    return src[:npos+1]
-
-def create_class(src):
-    environment = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile}
-    exec src in environment
-    for item in environment.values():
-        if hasattr(item, 'build_index'):
-            if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
-                return item
-
-def cleanup(tdir):
-    try:
-        if os.path.isdir(tdir):
-            shutil.rmtree(tdir)
-    except:
-        pass
\ No newline at end of file
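[Aside, not part of the diff: cutoff() above trims a summary near pos without cutting inside an HTML entity or tag — it extends the cut to the next ';' or '>' provided one falls within fuzz characters.]

    # With pos=7 the next ';' is at index 9 (inside '&amp;'), so the cut is
    # extended and the entity survives intact:
    print(cutoff('Fish &amp; chips tonight', 7))   # Fish &amp;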
diff --git a/src/calibre/ebooks/lrf/web/profiles/ap.py b/src/calibre/ebooks/lrf/web/profiles/ap.py
deleted file mode 100644
index 161699941a..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/ap.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-from calibre.ebooks.lrf.web.profiles import DefaultProfile
-
-
-class AssociatedPress(DefaultProfile):
-
-    title = 'Associated Press'
-    max_recursions = 2
-    max_articles_per_feed = 15
-    html2lrf_options = ['--force-page-break-before-tag="chapter"']
-
-
-    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-[
-        (r'.*?' , lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'', lambda match : ''),
-        (r'Learn more about our Privacy Policy.*?', lambda match : ''),
-    ]
-]
-
-
-    def get_feeds(self):
-        return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
-                 ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
-                 ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
-                 ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
-                 ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
-                 ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
-                 ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
-                 ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
-                 ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
-               ]
\ No newline at end of file
diff --git a/src/calibre/ebooks/lrf/web/profiles/atlantic.py b/src/calibre/ebooks/lrf/web/profiles/atlantic.py
deleted file mode 100644
index eebbe84d96..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/atlantic.py
+++ /dev/null
@@ -1,47 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import re
-from calibre.ebooks.lrf.web.profiles import DefaultProfile
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-class Atlantic(DefaultProfile):
-
-    title = 'The Atlantic'
-    max_recursions = 2
-    INDEX = 'http://www.theatlantic.com/doc/current'
-
-    preprocess_regexps = [
-        (re.compile(r'.*?<\!--\s+INVISIBLE SKIP .*?\s+-->',
-         lambda match : ''),
-        (r'', lambda match: ''),
-    ]
-    ]
-
-    def __init__(self, logger, verbose=False, username=None, password=None):
-        DefaultProfile.__init__(self, username, password)
-        self.browser = None # Needed as otherwise there are timeouts while fetching actual articles
-
-    def print_version(self, url):
-        return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
-
-    def get_feeds(self):
-        src = self.browser.open('http://economist.com/rss/').read()
-        soup = BeautifulSoup(src)
-        feeds = []
-        for ul in soup.findAll('ul'):
-            lis = ul.findAll('li')
-            try:
-                title, link = lis[0], lis[1]
-            except IndexError:
-                continue
-            title = title.string
-            if title:
-                title = title.strip()
-            if title not in self.__class__.TITLES:
-                continue
-            a = link.find('a')
-            feeds.append((title, a['href'].strip()))
-
-        return feeds
diff --git a/src/calibre/ebooks/lrf/web/profiles/faznet.py b/src/calibre/ebooks/lrf/web/profiles/faznet.py
deleted file mode 100644
index 53f2cde752..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/faznet.py
+++ /dev/null
@@ -1,28 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-Profile to download FAZ.net
-'''
-import re
-
-from calibre.ebooks.lrf.web.profiles import DefaultProfile
-
-class FazNet(DefaultProfile):
-
-    title = 'FAZ NET'
-    max_recursions = 2
-    html_description = True
-    max_articles_per_feed = 30
-
-    preprocess_regexps = [
-        (re.compile(r'Zum Thema.*?', re.IGNORECASE | re.DOTALL),
-         lambda match : ''),
-    ]
-
-
-    def get_feeds(self):
-        return [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
-
-    def print_version(self, url):
-        return url.replace('.html?rss_aktuell', '~Afor~Eprint.html')
-
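[Aside, not part of the diff: print_version() implementations like the two above are plain URL rewrites — a standalone illustration with a made-up URL.]

    # Rewriting an article URL to its printer-friendly form via str.replace.
    url = 'http://www.example.com/displaystory.cfm?story_id=123&fsrc=RSS'
    print(url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', ''))
    # http://www.example.com/PrinterFriendly.cfm?story_id=123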
diff --git a/src/calibre/ebooks/lrf/web/profiles/jpost.py b/src/calibre/ebooks/lrf/web/profiles/jpost.py
deleted file mode 100644
index ddc2a00e35..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/jpost.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import re
-from calibre.ebooks.lrf.web.profiles import DefaultProfile
-
-class JerusalemPost(DefaultProfile):
-
-    title = 'Jerusalem Post'
-    max_recursions = 2
-    max_articles_per_feed = 10
-
-
-    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-[
-        (r'.*?' , lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'.*?', lambda match : ''),
-        (r'', lambda match : ''),
-        (r'\'NWAnews.com', lambda match : ''),
-        (r'', lambda match : ''),
-        (r'.*?', lambda match : ''),
-
-    ]
-]
-
-    def get_feeds(self):
-        return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'),
-                 ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'),
-                 ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'),
-                 ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'),
-                 ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'),
-               ]
-
-    def print_version(self, url):
-        return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter')
-
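[Aside, not part of the diff: the (regexp, callable) pairs in these profiles were applied to the fetched HTML with re.sub, roughly as below. The pattern shown echoes the surviving 'Privacy Policy' rule from ap.py; the sample text is invented.]

    import re

    preprocess_regexps = [
        (re.compile(r'Learn more about our Privacy Policy.*', re.DOTALL),
         lambda match: ''),
    ]

    html = 'Story text. Learn more about our Privacy Policy and terms.'
    for pat, func in preprocess_regexps:
        html = pat.sub(func, html)   # each callable returns the replacement
    print(html)   # 'Story text. '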
diff --git a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py b/src/calibre/ebooks/lrf/web/profiles/jutarnji.py
deleted file mode 100644
index 93da341edd..0000000000
--- a/src/calibre/ebooks/lrf/web/profiles/jutarnji.py
+++ /dev/null
@@ -1,44 +0,0 @@
-'''
-  Profile to download Jutarnji.hr by Valloric
-'''
-
-import re
-
-from calibre.ebooks.lrf.web.profiles import DefaultProfile
-
-class Jutarnji(DefaultProfile):
-
-    title = 'Jutarnji'
-    max_recursions = 2
-    timefmt = ' [%d %b %Y]'
-    max_articles_per_feed = 80
-    html_description = True
-    no_stylesheets = True
-
-    preprocess_regexps = [
-        (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''),
-        (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''),
-        (re.compile(r')|()|()|()|())', lambda match: ''),
-
-        ## Remove any links/ads/comments/cruft from the end of the body of the article.
-        (r'(()|()|(©)|(