mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
LRF Output
This commit is contained in:
parent
3a99f99104
commit
538d310bb8
@ -291,6 +291,7 @@ from calibre.web.feeds.input import RecipeInput
|
||||
from calibre.ebooks.oeb.output import OEBOutput
|
||||
from calibre.ebooks.epub.output import EPUBOutput
|
||||
from calibre.ebooks.mobi.output import MOBIOutput
|
||||
from calibre.ebooks.lrf.output import LRFOutput
|
||||
from calibre.ebooks.txt.output import TXTOutput
|
||||
from calibre.ebooks.pdf.output import PDFOutput
|
||||
from calibre.ebooks.pml.input import PMLInput
|
||||
@ -310,7 +311,7 @@ from calibre.devices.jetbook.driver import JETBOOK
|
||||
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
|
||||
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
|
||||
FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
|
||||
PMLOutput, MOBIOutput]
|
||||
PMLOutput, MOBIOutput, LRFOutput]
|
||||
plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
|
||||
EB600, JETBOOK]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
|
@ -236,7 +236,6 @@ OptionRecommendation(name='page_breaks_before',
|
||||
'before the specified elements.')
|
||||
),
|
||||
|
||||
|
||||
OptionRecommendation(name='margin_top',
|
||||
recommended_value=5.0, level=OptionRecommendation.LOW,
|
||||
help=_('Set the top margin in pts. Default is %default. '
|
||||
@ -614,11 +613,18 @@ OptionRecommendation(name='list_recipes',
|
||||
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
|
||||
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
|
||||
|
||||
oibl = self.opts.insert_blank_line
|
||||
orps = self.opts.remove_paragraph_spacing
|
||||
if self.output_plugin.file_type == 'lrf':
|
||||
self.opts.insert_blank_line = False
|
||||
self.opts.remove_paragraph_spacing = False
|
||||
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
|
||||
lineh=self.opts.line_height,
|
||||
untable=self.output_plugin.file_type in ('mobi','lit'),
|
||||
unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
|
||||
flattener(self.oeb, self.opts)
|
||||
self.opts.insert_blank_line = oibl
|
||||
self.opts.remove_paragraph_spacing = orps
|
||||
|
||||
if self.opts.linearize_tables and \
|
||||
self.output_plugin.file_type not in ('mobi', 'lrf'):
|
||||
|
@ -1,43 +1,19 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
"""
|
||||
This package contains logic to read and write LRF files.
|
||||
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
|
||||
"""
|
||||
import sys, os
|
||||
from optparse import OptionValueError
|
||||
from htmlentitydefs import name2codepoint
|
||||
This package contains logic to read and write LRF files.
|
||||
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
|
||||
"""
|
||||
from uuid import uuid4
|
||||
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
|
||||
Paragraph, TextStyle, BlockStyle
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
|
||||
TextStyle, BlockStyle
|
||||
from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
|
||||
from calibre.ebooks import ConversionError
|
||||
from calibre import __appname__, __version__, __author__, iswindows
|
||||
from calibre.utils.config import OptionParser
|
||||
|
||||
__docformat__ = "epytext"
|
||||
|
||||
preferred_source_formats = [
|
||||
'LIT',
|
||||
'MOBI',
|
||||
'EPUB',
|
||||
'ODT',
|
||||
'HTML',
|
||||
'HTM',
|
||||
'XHTM',
|
||||
'XHTML',
|
||||
'PRC',
|
||||
'AZW',
|
||||
'FB2',
|
||||
'RTF',
|
||||
'PDF',
|
||||
'TXT',
|
||||
'ZIP',
|
||||
'RAR'
|
||||
]
|
||||
|
||||
class LRFParseError(Exception):
|
||||
pass
|
||||
|
||||
@ -55,174 +31,8 @@ class PRS500_PROFILE(object):
|
||||
header_height = 30 #: In px
|
||||
default_fonts = { 'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman",
|
||||
'serif': "Dutch801 Rm BT Roman"}
|
||||
|
||||
name = 'prs500'
|
||||
|
||||
profile_map = {
|
||||
PRS500_PROFILE.name : PRS500_PROFILE,
|
||||
}
|
||||
|
||||
def profile_from_string(option, opt_str, value, parser):
|
||||
try:
|
||||
profile = profile_map[value]
|
||||
setattr(parser.values, option.dest, profile)
|
||||
except KeyError:
|
||||
raise OptionValueError('Profile: '+value+' is not implemented. Implemented profiles: %s'%(profile_map.keys()))
|
||||
|
||||
def option_parser(usage, gui_mode=False):
|
||||
parser = OptionParser(usage=usage, gui_mode=gui_mode)
|
||||
metadata = parser.add_option_group('METADATA OPTIONS')
|
||||
metadata.add_option("-t", "--title", action="store", type="string", default=None,\
|
||||
dest="title", help=_("Set the title. Default: filename."))
|
||||
metadata.add_option("-a", "--author", action="store", type="string", \
|
||||
dest="author", help=_("Set the author(s). Multiple authors should be set as a comma separated list. Default: %default"),
|
||||
default=_('Unknown'))
|
||||
metadata.add_option("--comment", action="store", type="string", \
|
||||
dest="freetext", help=_("Set the comment."), default=_('Unknown'))
|
||||
metadata.add_option("--category", action="store", type="string", \
|
||||
dest="category", help=_("Set the category"), default=_('Unknown'))
|
||||
metadata.add_option('--title-sort', action='store', default='', dest='title_sort',
|
||||
help=_('Sort key for the title'))
|
||||
metadata.add_option('--author-sort', action='store', default='', dest='author_sort',
|
||||
help=_('Sort key for the author'))
|
||||
metadata.add_option('--publisher', action='store', default=_('Unknown'), dest='publisher',
|
||||
help=_('Publisher'))
|
||||
metadata.add_option('--cover', action='store', dest='cover', default=None, \
|
||||
help=_('Path to file containing image to be used as cover'))
|
||||
metadata.add_option('--use-metadata-cover', action='store_true', default=False,
|
||||
help=_('If there is a cover graphic detected in the source file, use that instead of the specified cover.'))
|
||||
|
||||
parser.add_option('-o', '--output', action='store', default=None, \
|
||||
help=_('Output file name. Default is derived from input filename'))
|
||||
parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables',
|
||||
help=_('Render HTML tables as blocks of text instead of actual tables. This is neccessary if the HTML contains very large or complex tables.'))
|
||||
laf = parser.add_option_group('LOOK AND FEEL')
|
||||
laf.add_option('--base-font-size', action='store', type='float', default=10.,
|
||||
help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. Default: %defaultpt'''))
|
||||
laf.add_option('--enable-autorotation', action='store_true', default=False,
|
||||
help=_('Enable autorotation of images that are wider than the screen width.'),
|
||||
dest='autorotation')
|
||||
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
|
||||
help=_('Set the space between words in pts. Default is %default'))
|
||||
laf.add_option('--blank-after-para', action='store_true', default=False,
|
||||
dest='blank_after_para', help=_('Separate paragraphs by blank lines.'))
|
||||
laf.add_option('--header', action='store_true', default=False, dest='header',
|
||||
help=_('Add a header to all the pages with title and author.'))
|
||||
laf.add_option('--headerformat', default="%t by %a", dest='headerformat', type='string',
|
||||
help=_('Set the format of the header. %a is replaced by the author and %t by the title. Default is %default'))
|
||||
laf.add_option('--header-separation', default=0, type='int',
|
||||
help=_('Add extra spacing below the header. Default is %default px.'))
|
||||
laf.add_option('--override-css', default=None, dest='_override_css', type='string',
|
||||
help=_('Override the CSS. Can be either a path to a CSS stylesheet or a string. If it is a string it is interpreted as CSS.'))
|
||||
laf.add_option('--use-spine', default=False, dest='use_spine', action='store_true',
|
||||
help=_('Use the <spine> element from the OPF file to determine the order in which the HTML files are appended to the LRF. The .opf file must be in the same directory as the base HTML file.'))
|
||||
laf.add_option('--minimum-indent', default=0, type='float',
|
||||
help=_('Minimum paragraph indent (the indent of the first line of a paragraph) in pts. Default: %default'))
|
||||
laf.add_option('--font-delta', action='store', type='float', default=0., \
|
||||
help=_("""Increase the font size by 2 * FONT_DELTA pts and """
|
||||
'''the line spacing by FONT_DELTA pts. FONT_DELTA can be a fraction.'''
|
||||
"""If FONT_DELTA is negative, the font size is decreased."""),
|
||||
dest='font_delta')
|
||||
laf.add_option('--ignore-colors', action='store_true', default=False, dest='ignore_colors',
|
||||
help=_('Render all content as black on white instead of the colors specified by the HTML or CSS.'))
|
||||
|
||||
page = parser.add_option_group('PAGE OPTIONS')
|
||||
profiles = profile_map.keys()
|
||||
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
|
||||
choices=profiles, action='callback', callback=profile_from_string,
|
||||
help=_('''Profile of the target device for which this LRF is '''
|
||||
'''being generated. The profile determines things like the '''
|
||||
'''resolution and screen size of the target device. '''
|
||||
'''Default: %s Supported profiles: ''')%(PRS500_PROFILE.name,)+\
|
||||
', '.join(profiles))
|
||||
page.add_option('--left-margin', default=20, dest='left_margin', type='int',
|
||||
help=_('''Left margin of page. Default is %default px.'''))
|
||||
page.add_option('--right-margin', default=20, dest='right_margin', type='int',
|
||||
help=_('''Right margin of page. Default is %default px.'''))
|
||||
page.add_option('--top-margin', default=10, dest='top_margin', type='int',
|
||||
help=_('''Top margin of page. Default is %default px.'''))
|
||||
page.add_option('--bottom-margin', default=0, dest='bottom_margin', type='int',
|
||||
help=_('''Bottom margin of page. Default is %default px.'''))
|
||||
page.add_option('--render-tables-as-images', default=False, action='store_true',
|
||||
help=_('Render tables in the HTML as images (useful if the document has large or complex tables)'))
|
||||
page.add_option('--text-size-multiplier-for-rendered-tables', type='float', default=1.0,
|
||||
help=_('Multiply the size of text in rendered tables by this factor. Default is %default'))
|
||||
|
||||
link = parser.add_option_group('LINK PROCESSING OPTIONS')
|
||||
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
||||
dest='link_levels',
|
||||
help=_(r'''The maximum number of levels to recursively process '''
|
||||
'''links. A value of 0 means thats links are not followed. '''
|
||||
'''A negative value means that <a> tags are ignored.'''))
|
||||
link.add_option('--link-exclude', dest='link_exclude', default='@',
|
||||
help=_('''A regular expression. <a> tags whose href '''
|
||||
'''matches will be ignored. Defaults to %default'''))
|
||||
link.add_option('--no-links-in-toc', action='store_true', default=False,
|
||||
dest='no_links_in_toc',
|
||||
help=_('''Don't add links to the table of contents.'''))
|
||||
chapter = parser.add_option_group('CHAPTER OPTIONS')
|
||||
chapter.add_option('--disable-chapter-detection', action='store_true',
|
||||
default=False, dest='disable_chapter_detection',
|
||||
help=_('''Prevent the automatic detection chapters.'''))
|
||||
chapter.add_option('--chapter-regex', dest='chapter_regex',
|
||||
default='chapter|book|appendix',
|
||||
help=_('''The regular expression used to detect chapter titles.'''
|
||||
''' It is searched for in heading tags (h1-h6). Defaults to %default'''))
|
||||
chapter.add_option('--chapter-attr', default='$,,$',
|
||||
help=_('Detect a chapter beginning at an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". You can set the attribute to "none" to match only on tag names. So for example, to match all h2 tags, you would use "h2,none,". Default is %default'''))
|
||||
chapter.add_option('--page-break-before-tag', dest='page_break', default='h[12]',
|
||||
help=_('''If html2lrf does not find any page breaks in the '''
|
||||
'''html file and cannot detect chapter headings, it will '''
|
||||
'''automatically insert page-breaks before the tags whose '''
|
||||
'''names match this regular expression. Defaults to %default. '''
|
||||
'''You can disable it by setting the regexp to "$". '''
|
||||
'''The purpose of this option is to try to ensure that '''
|
||||
'''there are no really long pages as this degrades the page '''
|
||||
'''turn performance of the LRF. Thus this option is ignored '''
|
||||
'''if the current page has only a few elements.'''))
|
||||
chapter.add_option('--force-page-break-before-tag', dest='force_page_break',
|
||||
default='$', help=_('Force a page break before tags whose names match this regular expression.'))
|
||||
chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr',
|
||||
default='$,,$', help=_('Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". Default is %default'''))
|
||||
chapter.add_option('--add-chapters-to-toc', action='store_true',
|
||||
default=False, dest='add_chapters_to_toc',
|
||||
help=_('''Add detected chapters to the table of contents.'''))
|
||||
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
|
||||
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
|
||||
help=_('''Preprocess Baen HTML files to improve generated LRF.'''))
|
||||
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
|
||||
help=_('''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.'''))
|
||||
prepro.add_option('--book-designer', action='store_true', default=False, dest='book_designer',
|
||||
help=_('''Use this option on html0 files from Book Designer.'''))
|
||||
|
||||
fonts = parser.add_option_group('FONT FAMILIES',
|
||||
_('''Specify trutype font families for serif, sans-serif and monospace fonts. '''
|
||||
'''These fonts will be embedded in the LRF file. Note that custom fonts lead to '''
|
||||
'''slower page turns. '''
|
||||
'''For example: '''
|
||||
'''--serif-family "Times New Roman"
|
||||
'''))
|
||||
fonts.add_option('--serif-family',
|
||||
default=None, dest='serif_family', type='string',
|
||||
help=_('The serif family of fonts to embed'))
|
||||
fonts.add_option('--sans-family',
|
||||
default=None, dest='sans_family', type='string',
|
||||
help=_('The sans-serif family of fonts to embed'))
|
||||
fonts.add_option('--mono-family',
|
||||
default=None, dest='mono_family', type='string',
|
||||
help=_('The monospace family of fonts to embed'))
|
||||
|
||||
debug = parser.add_option_group('DEBUG OPTIONS')
|
||||
debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
|
||||
help=_('''Be verbose while processing'''))
|
||||
debug.add_option('--lrs', action='store_true', dest='lrs', \
|
||||
help=_('Convert to LRS'), default=False)
|
||||
parser.add_option('--minimize-memory-usage', action='store_true', default=False,
|
||||
help=_('Minimize memory usage at the cost of longer processing times. Use this option if you are on a memory constrained machine.'))
|
||||
parser.add_option('--encoding', default=None,
|
||||
help=_('Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from windows computers is cp-1252. Another common choice is utf-8. The default is to try and guess the encoding.'))
|
||||
|
||||
return parser
|
||||
|
||||
name = 'prs500'
|
||||
|
||||
def find_custom_fonts(options, logger):
|
||||
from calibre.utils.fontconfig import files_for_family
|
||||
@ -238,16 +48,16 @@ def find_custom_fonts(options, logger):
|
||||
f = family(options.sans_family)
|
||||
fonts['sans'] = files_for_family(f)
|
||||
if not fonts['sans']:
|
||||
logger.warn('Unable to find sans family %s'%f)
|
||||
logger.warn('Unable to find sans family %s'%f)
|
||||
if options.mono_family:
|
||||
f = family(options.mono_family)
|
||||
fonts['mono'] = files_for_family(f)
|
||||
if not fonts['mono']:
|
||||
logger.warn('Unable to find mono family %s'%f)
|
||||
logger.warn('Unable to find mono family %s'%f)
|
||||
return fonts
|
||||
|
||||
|
||||
def Book(options, logger, font_delta=0, header=None,
|
||||
|
||||
|
||||
def Book(options, logger, font_delta=0, header=None,
|
||||
profile=PRS500_PROFILE, **settings):
|
||||
ps = {}
|
||||
ps['topmargin'] = options.top_margin
|
||||
@ -258,7 +68,7 @@ def Book(options, logger, font_delta=0, header=None,
|
||||
- profile.fudge
|
||||
if header:
|
||||
hdr = Header()
|
||||
hb = TextBlock(textStyle=TextStyle(align='foot',
|
||||
hb = TextBlock(textStyle=TextStyle(align='foot',
|
||||
fontsize=int(profile.header_font_size*10)),
|
||||
blockStyle=BlockStyle(blockwidth=ps['textwidth']))
|
||||
hb.append(header)
|
||||
@ -269,20 +79,20 @@ def Book(options, logger, font_delta=0, header=None,
|
||||
ps['topmargin'] = 0
|
||||
ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \
|
||||
- ps['headheight'] - ps['headsep'] - profile.fudge
|
||||
|
||||
|
||||
fontsize = int(10*profile.font_size+font_delta*20)
|
||||
baselineskip = fontsize + 20
|
||||
fonts = find_custom_fonts(options, logger)
|
||||
tsd = dict(fontsize=fontsize,
|
||||
parindent=int(10*profile.parindent),
|
||||
tsd = dict(fontsize=fontsize,
|
||||
parindent=int(10*profile.parindent),
|
||||
linespace=int(10*profile.line_space),
|
||||
baselineskip=baselineskip,
|
||||
wordspace=10*options.wordspace)
|
||||
if fonts['serif'] and fonts['serif'].has_key('normal'):
|
||||
tsd['fontfacename'] = fonts['serif']['normal'][1]
|
||||
|
||||
book = _Book(textstyledefault=tsd,
|
||||
pagestyledefault=ps,
|
||||
|
||||
book = _Book(textstyledefault=tsd,
|
||||
pagestyledefault=ps,
|
||||
blockstyledefault=dict(blockwidth=ps['textwidth']),
|
||||
bookid=uuid4().hex,
|
||||
**settings)
|
||||
@ -291,7 +101,7 @@ def Book(options, logger, font_delta=0, header=None,
|
||||
for font in fonts[family].values():
|
||||
book.embed_font(*font)
|
||||
FONT_FILE_MAP[font[1]] = font[0]
|
||||
|
||||
|
||||
for family in ['serif', 'sans', 'mono']:
|
||||
if not fonts[family]:
|
||||
fonts[family] = { 'normal' : (None, profile.default_fonts[family]) }
|
||||
@ -299,4 +109,3 @@ def Book(options, logger, font_delta=0, header=None,
|
||||
raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
|
||||
return book, fonts
|
||||
|
||||
from calibre import entity_to_unicode
|
||||
|
@ -1,2 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
@ -1,199 +0,0 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''Convert any ebook file into a LRF file.'''
|
||||
|
||||
import sys, os, logging, shutil, tempfile, re
|
||||
|
||||
from calibre.ebooks import UnknownFormatError
|
||||
from calibre.ebooks.lrf import option_parser as _option_parser
|
||||
from calibre import __appname__, setup_cli_handlers, extract
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.lrf.lit.convert_from import process_file as lit2lrf
|
||||
from calibre.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf
|
||||
from calibre.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
|
||||
from calibre.ebooks.lrf.txt.convert_from import process_file as txt2lrf
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html2lrf
|
||||
from calibre.ebooks.lrf.epub.convert_from import process_file as epub2lrf
|
||||
from calibre.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf
|
||||
from calibre.ebooks.lrf.fb2.convert_from import process_file as fb22lrf
|
||||
|
||||
from calibre.customize.ui import run_plugins_on_postprocess, run_plugins_on_preprocess
|
||||
|
||||
def largest_file(files):
|
||||
maxsize, file = 0, None
|
||||
for f in files:
|
||||
size = os.stat(f).st_size
|
||||
if size > maxsize:
|
||||
maxsize = size
|
||||
file = f
|
||||
return file
|
||||
|
||||
def find_htmlfile(dir):
|
||||
ext_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
|
||||
toc_pat = re.compile(r'toc', re.IGNORECASE)
|
||||
index_pat = re.compile(r'index', re.IGNORECASE)
|
||||
toc_files, index_files, files = [], [], []
|
||||
|
||||
for root, dirs, _files in os.walk(dir):
|
||||
for f in _files:
|
||||
f = os.path.abspath(os.path.join(root, f))
|
||||
ext = os.path.splitext(f)[1]
|
||||
if ext and ext_pat.match(ext):
|
||||
toc_files.append(f) if toc_pat.search(f) else \
|
||||
index_files.append(f) if index_pat.search(f) else \
|
||||
files.append(f)
|
||||
a = toc_files if toc_files else index_files if index_files else files
|
||||
if a:
|
||||
return largest_file(a)
|
||||
|
||||
def number_of_unhidden_files(base, listing):
|
||||
ans = 0
|
||||
for i in listing:
|
||||
i = os.path.join(base, i)
|
||||
if os.path.isdir(i) or os.path.basename(i).startswith('.'):
|
||||
continue
|
||||
ans += 1
|
||||
return ans
|
||||
|
||||
def unhidden_directories(base, listing):
|
||||
ans = []
|
||||
for i in listing:
|
||||
if os.path.isdir(os.path.join(base, i)) and not i.startswith('__') and \
|
||||
not i.startswith('.'):
|
||||
ans.append(i)
|
||||
return ans
|
||||
|
||||
def traverse_subdirs(tdir):
|
||||
temp = os.listdir(tdir)
|
||||
if number_of_unhidden_files(tdir, temp) == 0:
|
||||
try:
|
||||
cdir = os.path.join(tdir, unhidden_directories(tdir, temp)[0])
|
||||
return traverse_subdirs(cdir)
|
||||
except IndexError:
|
||||
pass
|
||||
return tdir
|
||||
|
||||
def handle_archive(path):
|
||||
tdir = tempfile.mkdtemp(prefix=__appname__+'_'+'archive_')
|
||||
extract(path, tdir)
|
||||
files = []
|
||||
cdir = traverse_subdirs(tdir)
|
||||
file = None
|
||||
exts = ['lit', 'rtf', 'fb2','pdf', 'txt', 'epub', 'mobi', 'prc']
|
||||
candidates = map(lambda x:os.path.join(cdir, x), os.listdir(cdir))
|
||||
for ext in exts:
|
||||
for f in candidates:
|
||||
if f.lower().endswith('.'+ext):
|
||||
files.append(f)
|
||||
file = largest_file(files)
|
||||
if not file:
|
||||
file = find_htmlfile(cdir)
|
||||
if isinstance(file, str):
|
||||
file = file.decode(sys.getfilesystemencoding())
|
||||
return tdir, file
|
||||
|
||||
def odt2lrf(path, options, logger):
|
||||
from calibre.ebooks.odt.to_oeb import Extract
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('odt2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
|
||||
with TemporaryDirectory('_odt2lrf') as tdir:
|
||||
opf = Extract()(path, tdir)
|
||||
options.use_spine = True
|
||||
options.encoding = 'utf-8'
|
||||
html_process_file(opf.replace('metadata.opf', 'index.html'), options, logger)
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
path = os.path.abspath(os.path.expanduser(path))
|
||||
path = run_plugins_on_preprocess(path)
|
||||
tdir = None
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('any2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
if not os.access(path, os.R_OK):
|
||||
logger.critical('Cannot read from %s', path)
|
||||
return 1
|
||||
ext = os.path.splitext(path)[1]
|
||||
if not ext or ext == '.':
|
||||
logger.critical('Unknown file type: %s', path)
|
||||
return 1
|
||||
ext = ext[1:].lower()
|
||||
cwd = os.getcwd()
|
||||
if not options.output:
|
||||
fmt = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.splitext(os.path.basename(path))[0] + fmt
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
if ext in ['zip', 'rar', 'oebzip']:
|
||||
newpath = None
|
||||
try:
|
||||
tdir, newpath = handle_archive(path)
|
||||
except:
|
||||
logger.exception(' ')
|
||||
if not newpath:
|
||||
raise UnknownFormatError('Could not find ebook in archive')
|
||||
path = newpath
|
||||
logger.info('Found ebook in archive: %s', repr(path))
|
||||
try:
|
||||
ext = os.path.splitext(path)[1][1:].lower()
|
||||
convertor = None
|
||||
if 'htm' in ext:
|
||||
convertor = html2lrf
|
||||
elif 'lit' == ext:
|
||||
convertor = lit2lrf
|
||||
elif 'pdf' == ext:
|
||||
convertor = pdf2lrf
|
||||
elif 'rtf' == ext:
|
||||
convertor = rtf2lrf
|
||||
elif 'txt' == ext:
|
||||
convertor = txt2lrf
|
||||
elif 'epub' == ext:
|
||||
convertor = epub2lrf
|
||||
elif ext in ['mobi', 'prc', 'azw']:
|
||||
convertor = mobi2lrf
|
||||
elif ext == 'fb2':
|
||||
convertor = fb22lrf
|
||||
elif ext == 'odt':
|
||||
convertor = odt2lrf
|
||||
if not convertor:
|
||||
raise UnknownFormatError(_('Converting from %s to LRF is not supported.')%ext)
|
||||
convertor(path, options, logger)
|
||||
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
if tdir and os.path.exists(tdir):
|
||||
shutil.rmtree(tdir)
|
||||
return 0
|
||||
|
||||
|
||||
def option_parser(gui_mode=False):
|
||||
return _option_parser(usage=_('''\
|
||||
any2lrf [options] myfile
|
||||
|
||||
Convert any ebook format into LRF. Supported formats are:
|
||||
LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or
|
||||
ZIP archive, looking for an ebook inside the archive.
|
||||
'''), gui_mode=gui_mode)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None, gui_mode=False):
|
||||
parser = option_parser(gui_mode)
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print _('No file to convert specified.')
|
||||
return 1
|
||||
|
||||
src = args[1]
|
||||
if not isinstance(src, unicode):
|
||||
src = src.decode(sys.getfilesystemencoding())
|
||||
return process_file(src, options, logger)
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,3 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
@ -1,75 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, sys, shutil, logging
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks import ConversionError, DRMError
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.metadata.opf import OPF
|
||||
from calibre.ebooks.metadata.epub import OCFDirReader
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
|
||||
def option_parser():
|
||||
return lrf_option_parser(
|
||||
_('''Usage: %prog [options] mybook.epub
|
||||
|
||||
|
||||
%prog converts mybook.epub to mybook.lrf''')
|
||||
)
|
||||
|
||||
def generate_html(pathtoepub, logger):
|
||||
if not os.access(pathtoepub, os.R_OK):
|
||||
raise ConversionError('Cannot read from ' + pathtoepub)
|
||||
tdir = PersistentTemporaryDirectory('_epub2lrf')
|
||||
#os.rmdir(tdir)
|
||||
try:
|
||||
ZipFile(pathtoepub).extractall(tdir)
|
||||
except:
|
||||
raise ConversionError, '.epub extraction failed'
|
||||
if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')):
|
||||
raise DRMError(os.path.basename(pathtoepub))
|
||||
|
||||
return tdir
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('epub2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
epub = os.path.abspath(os.path.expanduser(path))
|
||||
tdir = generate_html(epub, logger)
|
||||
try:
|
||||
ocf = OCFDirReader(tdir)
|
||||
htmlfile = ocf.opf.spine[0].path
|
||||
options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE])
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
options.use_spine = True
|
||||
|
||||
html_process_file(htmlfile, options, logger=logger)
|
||||
finally:
|
||||
try:
|
||||
shutil.rmtree(tdir)
|
||||
except:
|
||||
logger.warning('Failed to delete temporary directory '+tdir)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print 'No epub file specified'
|
||||
return 1
|
||||
process_file(args[1], options, logger)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,4 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
@ -1,59 +0,0 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Convert web feeds to LRF files.
|
||||
'''
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file
|
||||
from calibre.web.feeds.main import option_parser as feeds_option_parser
|
||||
from calibre.web.feeds.main import run_recipe
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre import sanitize_file_name, strftime
|
||||
|
||||
import sys, os
|
||||
|
||||
def option_parser():
|
||||
parser = feeds_option_parser()
|
||||
parser.remove_option('--output-dir')
|
||||
parser.remove_option('--lrf')
|
||||
parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
|
||||
lrf_parser = lrf_option_parser('')
|
||||
lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
|
||||
parser.merge(lrf_parser)
|
||||
return parser
|
||||
|
||||
def main(args=sys.argv, notification=None, handler=None):
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
opts.lrf = True
|
||||
|
||||
if len(args) != 2 and opts.feeds is None:
|
||||
parser.print_help()
|
||||
return 1
|
||||
|
||||
recipe_arg = args[1] if len(args) > 1 else None
|
||||
|
||||
with TemporaryDirectory('_feeds2lrf') as tdir:
|
||||
opts.output_dir = tdir
|
||||
|
||||
recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
|
||||
|
||||
htmlfile = os.path.join(tdir, 'index.html')
|
||||
if not os.access(htmlfile, os.R_OK):
|
||||
raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg)
|
||||
|
||||
lparser = lrf_option_parser('')
|
||||
ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
|
||||
parser.merge_options(ropts, opts)
|
||||
|
||||
if not opts.output:
|
||||
ext = '.lrs' if opts.lrs else '.lrf'
|
||||
fname = recipe.title + strftime(recipe.timefmt)+ext
|
||||
opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
|
||||
print 'Generating LRF...'
|
||||
process_file(htmlfile, opts)
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
@ -1,90 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, sys, shutil, glob, logging
|
||||
from tempfile import mkdtemp
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
from calibre.ebooks import ConversionError
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.metadata.opf import OPFReader
|
||||
from calibre import __appname__, setup_cli_handlers
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(
|
||||
_('''Usage: %prog [options] mybook.lit
|
||||
|
||||
|
||||
%prog converts mybook.lit to mybook.lrf''')
|
||||
)
|
||||
return parser
|
||||
|
||||
def generate_html(pathtolit, logger):
|
||||
if not os.access(pathtolit, os.R_OK):
|
||||
raise ConversionError, 'Cannot read from ' + pathtolit
|
||||
tdir = mkdtemp(prefix=__appname__+'_'+'lit2oeb_')
|
||||
lr = LitReader(pathtolit)
|
||||
print 'Extracting LIT file to', tdir
|
||||
lr.extract_content(tdir)
|
||||
return tdir
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('lit2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
lit = os.path.abspath(os.path.expanduser(path))
|
||||
tdir = generate_html(lit, logger)
|
||||
try:
|
||||
opf = glob.glob(os.path.join(tdir, '*.opf'))
|
||||
if opf:
|
||||
path = opf[0]
|
||||
opf = OPFReader(path)
|
||||
htmlfile = opf.spine[0].path.replace('&', '%26') #convertlit replaces & with %26
|
||||
options.opf = path
|
||||
else:
|
||||
l = glob.glob(os.path.join(tdir, '*toc*.htm*'))
|
||||
if not l:
|
||||
l = glob.glob(os.path.join(tdir, '*top*.htm*'))
|
||||
if not l:
|
||||
l = glob.glob(os.path.join(tdir, '*contents*.htm*'))
|
||||
if not l:
|
||||
l = glob.glob(os.path.join(tdir, '*.htm*'))
|
||||
if not l:
|
||||
l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit file apparently have .txt files in them
|
||||
if not l:
|
||||
raise ConversionError('Conversion of lit to html failed. Cannot find html file.')
|
||||
maxsize, htmlfile = 0, None
|
||||
for c in l:
|
||||
sz = os.path.getsize(c)
|
||||
if sz > maxsize:
|
||||
maxsize, htmlfile = sz, c
|
||||
else:
|
||||
htmlfile = l[0]
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
options.use_spine = True
|
||||
html_process_file(htmlfile, options, logger=logger)
|
||||
finally:
|
||||
try:
|
||||
shutil.rmtree(tdir)
|
||||
except:
|
||||
logger.warning('Failed to delete temporary directory '+tdir)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print 'No lit file specified'
|
||||
return 1
|
||||
process_file(args[1], options, logger)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,63 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
''''''
|
||||
|
||||
import sys, tempfile, os, logging, shutil
|
||||
|
||||
from calibre import setup_cli_handlers, __appname__
|
||||
from calibre.ebooks.mobi.reader import MobiReader
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
|
||||
def generate_html(mobifile, tdir):
|
||||
mr = MobiReader(mobifile)
|
||||
mr.extract_content(tdir)
|
||||
return mr.htmlfile
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('lit2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
mobi = os.path.abspath(os.path.expanduser(path))
|
||||
tdir = tempfile.mkdtemp('mobi2lrf', __appname__)
|
||||
try:
|
||||
htmlfile = generate_html(mobi, tdir)
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
options.use_spine = True
|
||||
html_process_file(htmlfile, options, logger=logger)
|
||||
finally:
|
||||
try:
|
||||
shutil.rmtree(tdir)
|
||||
except:
|
||||
logger.warning('Failed to delete temporary directory '+tdir)
|
||||
|
||||
def option_parser():
|
||||
return lrf_option_parser(
|
||||
_('''Usage: %prog [options] mybook.mobi|prc
|
||||
|
||||
|
||||
%prog converts mybook.mobi to mybook.lrf''')
|
||||
)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print 'No mobi file specified'
|
||||
return 1
|
||||
process_file(args[1], options, logger)
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -2,7 +2,8 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import struct, array, zlib, cStringIO, collections, re
|
||||
|
||||
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE, entity_to_unicode
|
||||
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.ebooks.lrf.tags import Tag
|
||||
|
||||
ruby_tags = {
|
||||
|
135
src/calibre/ebooks/lrf/output.py
Normal file
135
src/calibre/ebooks/lrf/output.py
Normal file
@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
|
||||
class LRFOptions(object):
|
||||
|
||||
def __init__(self, output, opts, oeb):
|
||||
def f2s(f):
|
||||
try:
|
||||
return unicode(f[0])
|
||||
except:
|
||||
return ''
|
||||
m = oeb.metadata
|
||||
self.title = None
|
||||
self.author = self.publisher = _('Unknown')
|
||||
self.freetext = f2s(m.description)
|
||||
self.category = f2s(m.tags)
|
||||
self.title_sort = self.author_sort = ''
|
||||
self.cover = None
|
||||
self.use_metadata_cover = True
|
||||
self.output = output
|
||||
self.ignore_tables = opts.linearize_tables
|
||||
self.base_font_size = 0
|
||||
self.blank_after_para = opts.insert_blank_line
|
||||
self.use_spine = True
|
||||
self.font_delta = 0
|
||||
self.ignore_colors = False
|
||||
from calibre.ebooks.lrf import PRS500_PROFILE
|
||||
self.profile = PRS500_PROFILE
|
||||
self.link_levels = sys.maxint
|
||||
self.link_exclude = '@'
|
||||
self.no_links_in_toc = True
|
||||
self.disable_chapter_detection = True
|
||||
self.chapter_regex = 'dsadcdswcdec'
|
||||
self.chapter_attr = '$,,$'
|
||||
self.override_css = self._override_css = ''
|
||||
self.page_break = 'h[12]'
|
||||
self.force_page_break = '$'
|
||||
self.force_page_break_attr = '$'
|
||||
self.add_chapters_to_toc = False
|
||||
self.baen = self.pdftohtml = self.book_designer = False
|
||||
self.verbose = opts.verbose
|
||||
self.encoding = 'utf-8'
|
||||
self.lrs = False
|
||||
self.minimize_memory_usage = False
|
||||
self.autorotation = opts.enable_autorotation
|
||||
|
||||
|
||||
for x in ('top', 'bottom', 'left', 'right'):
|
||||
setattr(self, x+'_margin', (self.profile.dpi/72.) * getattr(opts,
|
||||
'margin_'+x))
|
||||
|
||||
for x in ('wordspace', 'header', 'header_format',
|
||||
'header_separation', 'minimum_indent', 'serif_family',
|
||||
'render_tables_as_images', 'sans_family', 'mono_family',
|
||||
'text_size_multiplier_for_rendered_tables'):
|
||||
setattr(self, x, getattr(opts, x))
|
||||
|
||||
class LRFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'LRF Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'lrf'
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='enable_autorotation', recommended_value=False,
|
||||
help=_('Enable autorotation of images that are wider than the screen width.')
|
||||
),
|
||||
OptionRecommendation(name='wordspace',
|
||||
recommended_value=2.5, level=OptionRecommendation.LOW,
|
||||
help=_('Set the space between words in pts. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header', recommended_value=False,
|
||||
help=_('Add a header to all the pages with title and author.')
|
||||
),
|
||||
OptionRecommendation(name='header_format', recommended_value="%t by %a",
|
||||
help=_('Set the format of the header. %a is replaced by the author '
|
||||
'and %t by the title. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header_separation', recommended_value=0,
|
||||
help=_('Add extra spacing below the header. Default is %default px.')
|
||||
),
|
||||
OptionRecommendation(name='minimum_indent', recommended_value=0,
|
||||
help=_('Minimum paragraph indent (the indent of the first line '
|
||||
'of a paragraph) in pts. Default: %default')
|
||||
),
|
||||
OptionRecommendation(name='render_tables_as_images',
|
||||
recommended_value=False,
|
||||
help=_('Render tables in the HTML as images (useful if the '
|
||||
'document has large or complex tables)')
|
||||
),
|
||||
OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
|
||||
recommended_value=1,
|
||||
help=_('Multiply the size of text in rendered tables by this '
|
||||
'factor. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='serif_family', recommended_value=None,
|
||||
help=_('The serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='sans_family', recommended_value=None,
|
||||
help=_('The sans-serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='mono_family', recommended_value=None,
|
||||
help=_('The monospace family of fonts to embed')
|
||||
),
|
||||
|
||||
])
|
||||
|
||||
recommendations = set([
|
||||
('dont_justify', True, OptionRecommendation.HIGH),
|
||||
])
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
lrf_opts = LRFOptions(output_path, opts, oeb)
|
||||
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
with TemporaryDirectory('_lrf_output') as tdir:
|
||||
from calibre.customize.ui import plugin_for_output_format
|
||||
oeb_output = plugin_for_output_format('oeb')
|
||||
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
|
||||
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file
|
||||
process_file(os.path.join(tdir, opf), lrf_opts, self.log)
|
||||
|
@ -1,2 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
@ -1,131 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
''''''
|
||||
|
||||
import sys, os, subprocess, logging
|
||||
import errno
|
||||
from functools import partial
|
||||
from calibre import isosx, setup_cli_handlers, filename_to_utf8, iswindows, islinux
|
||||
from calibre.ebooks import ConversionError, DRMError
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
from calibre.ebooks.metadata.pdf import get_metadata
|
||||
|
||||
PDFTOHTML = 'pdftohtml'
|
||||
popen = subprocess.Popen
|
||||
if isosx and hasattr(sys, 'frameworks_dir'):
|
||||
PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
|
||||
if iswindows and hasattr(sys, 'frozen'):
|
||||
PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
|
||||
popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
|
||||
if islinux and getattr(sys, 'frozen_path', False):
|
||||
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
|
||||
|
||||
def generate_html(pathtopdf, tdir):
|
||||
'''
|
||||
Convert the pdf into html.
|
||||
@return: Path to a temporary file containing the HTML.
|
||||
'''
|
||||
if isinstance(pathtopdf, unicode):
|
||||
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
|
||||
if not os.access(pathtopdf, os.R_OK):
|
||||
raise ConversionError, 'Cannot read from ' + pathtopdf
|
||||
index = os.path.join(tdir, 'index.html')
|
||||
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
|
||||
pathtopdf = os.path.abspath(pathtopdf)
|
||||
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
|
||||
'-nodrm', pathtopdf, os.path.basename(index))
|
||||
cwd = os.getcwd()
|
||||
|
||||
try:
|
||||
os.chdir(tdir)
|
||||
try:
|
||||
p = popen(cmd, stderr=subprocess.PIPE)
|
||||
except OSError, err:
|
||||
if err.errno == 2:
|
||||
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
|
||||
else:
|
||||
raise
|
||||
|
||||
'''
|
||||
print p.stdout.read()
|
||||
'''
|
||||
while True:
|
||||
try:
|
||||
ret = p.wait()
|
||||
break
|
||||
except OSError, e:
|
||||
if e.errno == errno.EINTR:
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
|
||||
if ret != 0:
|
||||
err = p.stderr.read()
|
||||
raise ConversionError, err
|
||||
if not os.path.exists(index) or os.stat(index).st_size < 100:
|
||||
raise DRMError()
|
||||
|
||||
raw = open(index, 'rb').read()
|
||||
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
|
||||
if not '<br' in raw[:4000]:
|
||||
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
|
||||
try:
|
||||
mi = get_metadata(open(pathtopdf, 'rb'))
|
||||
except:
|
||||
mi = MetaInformation(None, None)
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([('index.html', None)])
|
||||
opf.create_spine(['index.html'])
|
||||
opf.render(open('metadata.opf', 'wb'))
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
return index
|
||||
|
||||
def option_parser():
|
||||
return lrf_option_parser(
|
||||
_('''%prog [options] mybook.pdf
|
||||
|
||||
|
||||
%prog converts mybook.pdf to mybook.lrf''')
|
||||
)
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('pdf2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
pdf = os.path.abspath(os.path.expanduser(path))
|
||||
tdir = PersistentTemporaryDirectory('_pdf2lrf')
|
||||
htmlfile = generate_html(pdf, tdir)
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
else:
|
||||
options.output = os.path.abspath(options.output)
|
||||
options.pdftohtml = True
|
||||
if not options.title:
|
||||
options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
|
||||
html_process_file(htmlfile, options, logger)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print 'No pdf file specified'
|
||||
return 1
|
||||
process_file(args[1], options, logger)
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,426 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Convert PDF to a reflowable format using pdftoxml.exe as the PDF parsing backend.
|
||||
'''
|
||||
|
||||
import sys, os, re, tempfile, subprocess, atexit, shutil, logging, xml.parsers.expat
|
||||
from xml.etree.ElementTree import parse
|
||||
|
||||
from calibre import isosx, setup_cli_handlers, __appname__
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.ebooks import ConversionError
|
||||
|
||||
PDFTOXML = 'pdftoxml.exe'
|
||||
if isosx and hasattr(sys, 'frameworks_dir'):
|
||||
PDFTOXML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOXML)
|
||||
|
||||
class StyleContainer(object):
|
||||
|
||||
def set_style(self, iterator):
|
||||
styles = set([])
|
||||
for tok in iterator:
|
||||
if hasattr(tok, 'style'):
|
||||
styles.add(tok.style)
|
||||
counts = [0*i for i in range(len(styles))]
|
||||
for i in range(len(styles)):
|
||||
counts[i] = sum([1 for j in self if j.style == styles[i]])
|
||||
max = max(counts)
|
||||
for i in range(len(counts)):
|
||||
if counts[i] == max:
|
||||
break
|
||||
self.style = styles[i]
|
||||
for obj in iterator:
|
||||
if obj.style == self.style:
|
||||
obj.style = None
|
||||
|
||||
|
||||
class Page(object):
|
||||
|
||||
def __init__(self, attrs):
|
||||
for a in ('number', 'width', 'height'):
|
||||
setattr(self, a, float(attrs[a]))
|
||||
self.id = attrs['id']
|
||||
self.current_line = None
|
||||
self.lines = []
|
||||
|
||||
def end_line(self):
|
||||
if self.current_line is not None:
|
||||
self.current_line.finalize()
|
||||
self.lines.append(self.current_line)
|
||||
self.current_line = None
|
||||
|
||||
def finalize(self):
|
||||
self.identify_groups()
|
||||
self.look_for_page_break()
|
||||
|
||||
def identify_groups(self):
|
||||
groups = []
|
||||
in_group = False
|
||||
for i in range(len(self.lines)):
|
||||
if not in_group:
|
||||
groups.append(i)
|
||||
in_group = True
|
||||
else:
|
||||
pl = self.lines[i-1]
|
||||
cl = self.lines[i]
|
||||
if cl.left != pl.left and cl.width != pl.width:
|
||||
groups.append(i)
|
||||
self.groups = []
|
||||
for i in range(len(groups)):
|
||||
start = groups[i]
|
||||
if i +1 == len(groups):
|
||||
stop = len(self.lines)
|
||||
else:
|
||||
stop = groups[i+i]
|
||||
self.groups.append(self.lines[start:stop])
|
||||
|
||||
if len(self.groups) > 1:
|
||||
self.group[0].test_header(self.width, self.height)
|
||||
self.groups[-1].test_footer(self.width, self.height)
|
||||
|
||||
def look_for_page_break(self):
|
||||
max = 0
|
||||
for g in self.groups:
|
||||
if not g.is_footer and g.bottom > max:
|
||||
max = g.bottom
|
||||
self.page_break_after = max < 0.8*self.height
|
||||
|
||||
|
||||
class Group(StyleContainer):
|
||||
|
||||
def __init__(self, lines):
|
||||
self.lines = lines
|
||||
self.set_style(self.lines)
|
||||
self.width = max([i.width for i in self.lines])
|
||||
self.bottom = max([i.bottom for i in self.lines])
|
||||
tot, ltot = 0, 0
|
||||
for i in range(1, len(self.lines)):
|
||||
bot = self.lines[i-1].bottom
|
||||
top = self.lines[i].top
|
||||
tot += abs(top - bot)
|
||||
ltot += self.lines[i].left
|
||||
self.average_line_spacing = tot/float(len(self.lines)-1)
|
||||
ltot += self.lines[0].left
|
||||
self.average_left_margin = ltot/float(len(self.lines))
|
||||
self.left_margin = min([i.left for i in self.lines])
|
||||
|
||||
self.detect_paragraphs()
|
||||
|
||||
|
||||
|
||||
def detect_paragraphs(self):
|
||||
if not self.lines:
|
||||
return
|
||||
indent_buffer = 5
|
||||
self.lines[0].is_para_start = self.lines[0].left > self.average_left_margin+indent_buffer
|
||||
for i in range(1, len(self.lines)):
|
||||
pl, l = self.lines[i-1:i+1]
|
||||
c1 = pl.bottom - l.top > self.average_line_spacing
|
||||
c2 = l.left > self.average_left_margin+indent_buffer
|
||||
c3 = pl.width < 0.8 * self.width
|
||||
l.is_para_start = c1 or c2 or c3
|
||||
|
||||
def test_header(self, page_width, page_height):
|
||||
self.is_header = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width
|
||||
|
||||
def test_footer(self, page_width, page_height):
|
||||
self.is_footer = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width
|
||||
|
||||
class Text(object):
|
||||
|
||||
def __init__(self, attrs):
|
||||
for a in ('x', 'y', 'width', 'height'):
|
||||
setattr(self, a, float(attrs[a]))
|
||||
self.id = attrs['id']
|
||||
self.objects = []
|
||||
|
||||
def add_token(self, tok):
|
||||
if not self.objects:
|
||||
self.objects.append(tok)
|
||||
else:
|
||||
ptok = self.objects[-1]
|
||||
if tok == ptok:
|
||||
ptok.text += ' ' + tok.text
|
||||
else:
|
||||
self.objects.append(tok)
|
||||
|
||||
def add(self, object):
|
||||
if isinstance(object, Token):
|
||||
self.add_token(object)
|
||||
else:
|
||||
print 'WARNING: Unhandled object', object.__class__.__name__
|
||||
|
||||
def to_xhtml(self):
|
||||
res = []
|
||||
for obj in self.objects:
|
||||
if isinstance(obj, Token):
|
||||
res.append(obj.to_xhtml())
|
||||
return ' '.join(res)
|
||||
|
||||
|
||||
class Line(list, StyleContainer):
|
||||
|
||||
def calculate_geometry(self):
|
||||
self.left = self[0].x
|
||||
self.width = self[-1].x + self[-1].width - self.left
|
||||
self.top = min(o.y for o in self)
|
||||
self.bottom = max(o.height+o.y for o in self)
|
||||
|
||||
def finalize(self):
|
||||
self.calculate_geometry()
|
||||
self.set_style(self)
|
||||
|
||||
def to_xhtml(self, group_id):
|
||||
ans = '<span class="%s" '%group_id
|
||||
if self.style is not None:
|
||||
ans += 'style="%s"'%self.style.to_css(inline=True)
|
||||
ans += '>%s</span>'
|
||||
res = []
|
||||
for object in self:
|
||||
if isinstance(object, Text):
|
||||
res.append(object.to_xhtml())
|
||||
|
||||
return ans%(' '.join(res))
|
||||
|
||||
|
||||
class TextStyle(object):
|
||||
|
||||
def __init__(self, tok):
|
||||
self.bold = tok.bold
|
||||
self.italic = tok.italic
|
||||
self.font_name = tok.font_name
|
||||
self.font_size = tok.font_size
|
||||
self.color = tok.font_color
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, self.__class__):
|
||||
for a in ('font_size', 'bold', 'italic', 'font_name', 'color'):
|
||||
if getattr(self, a) != getattr(other, a):
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
def to_css(self, inline=False):
|
||||
fw = 'bold' if self.bold else 'normal'
|
||||
fs = 'italic' if self.italic else 'normal'
|
||||
fsz = '%dpt'%self.font_size
|
||||
props = ['font-weight: %s;'%fw, 'font-style: %s;'%fs, 'font-size: %s;'%fsz,
|
||||
'color: rgb(%d, %d, %d);'%self.color]
|
||||
joiner = ' '
|
||||
if not inline:
|
||||
joiner = '\n'
|
||||
props = ['{'] + props + ['}']
|
||||
return joiner.join(props)
|
||||
|
||||
class Token(object):
|
||||
|
||||
def __init__(self, attrs):
|
||||
for a in ('x', 'y', 'width', 'height', 'rotation', 'angle', 'font-size'):
|
||||
setattr(self, a.replace('-', '_'), float(attrs[a]))
|
||||
for a in ('bold', 'italic'):
|
||||
setattr(self, a, attrs[a]=='yes')
|
||||
self.font_name = attrs['font-name']
|
||||
fc = re.compile(r'#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})', re.IGNORECASE)
|
||||
fc = fc.match(attrs['font-color'])
|
||||
self.font_color = (int(fc.group(1), 16), int(fc.group(2), 16), int(fc.group(3), 16))
|
||||
self.id = attrs['id']
|
||||
self.text = u''
|
||||
self.style = TextStyle(self)
|
||||
|
||||
def handle_char_data(self, data):
|
||||
self.text += data
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, self.__class__):
|
||||
for a in ('rotation', 'angle', 'font_size', 'bold', 'italic', 'font_name', 'font_color'):
|
||||
if getattr(self, a) != getattr(other, a):
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
def to_xhtml(self):
|
||||
if self.style is not None:
|
||||
ans = u'<span style="%s">%s</span>'%(self.style.to_css(inline=True), self.text)
|
||||
else:
|
||||
ans = self.text
|
||||
return ans
|
||||
|
||||
class PDFDocument(object):
|
||||
|
||||
SKIPPED_TAGS = ('DOCUMENT', 'METADATA', 'PDFFILENAME', 'PROCESS', 'VERSION',
|
||||
'COMMENT', 'CREATIONDATE')
|
||||
|
||||
def __init__(self, filename):
|
||||
parser = xml.parsers.expat.ParserCreate('UTF-8')
|
||||
parser.buffer_text = True
|
||||
parser.returns_unicode = True
|
||||
parser.StartElementHandler = self.start_element
|
||||
parser.EndElementHandler = self.end_element
|
||||
|
||||
self.pages = []
|
||||
self.current_page = None
|
||||
self.current_token = None
|
||||
|
||||
src = open(filename, 'rb').read()
|
||||
self.parser = parser
|
||||
parser.Parse(src)
|
||||
|
||||
|
||||
def start_element(self, name, attrs):
|
||||
if name == 'TOKEN':
|
||||
self.current_token = Token(attrs)
|
||||
self.parser.CharacterDataHandler = self.current_token.handle_char_data
|
||||
elif name == 'TEXT':
|
||||
text = Text(attrs)
|
||||
if self.current_page.current_line is None:
|
||||
self.current_page.current_line = Line()
|
||||
self.current_page.current_line.append(text)
|
||||
else:
|
||||
y, height = self.current_page.current_line[0].y, self.current_page.current_line[0].height
|
||||
if y == text.y or y+height == text.y + text.height:
|
||||
self.current_page.current_line.append(text)
|
||||
else:
|
||||
self.current_page.end_line()
|
||||
self.current_page.current_line = Line()
|
||||
self.current_page.current_line.append(text)
|
||||
elif name == 'PAGE':
|
||||
self.current_page = Page(attrs)
|
||||
elif name.lower() == 'xi:include':
|
||||
print 'WARNING: Skipping vector image'
|
||||
elif name in self.SKIPPED_TAGS:
|
||||
pass
|
||||
else:
|
||||
print 'WARNING: Unhandled element', name
|
||||
|
||||
def end_element(self, name):
|
||||
if name == 'TOKEN':
|
||||
if self.current_token.angle == 0 and self.current_token.rotation == 0:
|
||||
self.current_page.current_line[-1].add(self.current_token)
|
||||
self.current_token = None
|
||||
self.parser.CharacterDataHandler = None
|
||||
elif name == 'PAGE':
|
||||
self.current_page.finalize()
|
||||
self.pages.append(self.current_page)
|
||||
self.current_page = None
|
||||
|
||||
|
||||
def to_xhtml(self):
|
||||
header = u'''\
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
||||
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://www.w3.org/MarkUp/SCHEMA/xhtml11.xsd" >
|
||||
<head>
|
||||
<style type="text/css">
|
||||
%(style)s
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
%(body)s
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
res = []
|
||||
para = []
|
||||
styles = []
|
||||
for page in self.pages:
|
||||
res.append(u'<a name="%s" />'%page.id)
|
||||
for group in page.groups:
|
||||
if group.is_header or group.is_footer:
|
||||
continue
|
||||
if group.style is not None:
|
||||
styles.append(u'.%s %s\n'%(group.id, group.style.to_css()))
|
||||
for line in group.lines:
|
||||
if line.is_para_start:
|
||||
indent = group.left_margin - line.left
|
||||
if para:
|
||||
res.append(u'<p style="text-indent: %dpt">%s</p>'%(indent, ''.join(para)))
|
||||
para = []
|
||||
para.append(line.to_xhtml(group.id))
|
||||
if page.page_break_after:
|
||||
res.append(u'<br style="page-break-after:always" />')
|
||||
if para:
|
||||
res.append(u'<p>%s</p>'%(''.join(para)))
|
||||
para = []
|
||||
|
||||
return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8')
|
||||
|
||||
class PDFConverter(object):
|
||||
|
||||
@classmethod
|
||||
def generate_xml(cls, pathtopdf, logger):
|
||||
pathtopdf = os.path.abspath(pathtopdf)
|
||||
tdir = tempfile.mkdtemp('pdf2xml', __appname__)
|
||||
atexit.register(shutil.rmtree, tdir)
|
||||
xmlfile = os.path.basename(pathtopdf)+'.xml'
|
||||
os.chdir(tdir)
|
||||
cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile)
|
||||
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE)
|
||||
log = p.stdout.read()
|
||||
ret = p.wait()
|
||||
if ret != 0:
|
||||
raise ConversionError, log
|
||||
xmlfile = os.path.join(tdir, xmlfile)
|
||||
if os.stat(xmlfile).st_size < 20:
|
||||
raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.')
|
||||
return xmlfile
|
||||
|
||||
|
||||
def __init__(self, pathtopdf, logger, opts):
|
||||
self.cwd = os.getcwdu()
|
||||
self.logger = logger
|
||||
self.opts = opts
|
||||
try:
|
||||
self.logger.info('Converting PDF to XML')
|
||||
self.xmlfile = self.generate_xml(pathtopdf, self.logger)
|
||||
self.tdir = os.path.dirname(self.xmlfile)
|
||||
self.data_dir = self.xmlfile + '_data'
|
||||
outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml'
|
||||
self.logger.info('Parsing XML')
|
||||
self.document = PDFDocument(self.xmlfile)
|
||||
self.outline = parse(outline_file)
|
||||
finally:
|
||||
os.chdir(self.cwd)
|
||||
|
||||
def convert(self, output_dir):
|
||||
doc = self.document.to_xhtml()
|
||||
open(os.path.join(output_dir, 'document.html'), 'wb').write(doc)
|
||||
|
||||
|
||||
|
||||
def option_parser():
|
||||
parser = OptionParser(usage=\
|
||||
'''
|
||||
%prog [options] myfile.pdf
|
||||
|
||||
Convert a PDF file to an HTML file.
|
||||
''')
|
||||
parser.add_option('-o', '--output-dir', default='.',
|
||||
help=_('Path to output directory in which to create the HTML file. Defaults to current directory.'))
|
||||
parser.add_option('--verbose', default=False, action='store_true',
|
||||
help=_('Be more verbose.'))
|
||||
return parser
|
||||
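# Typical command line usage (a sketch; 'pdf2html' stands for however this
# script is exposed on a particular install):
#
#   pdf2html -o /tmp/out mybook.pdf
#
# which writes /tmp/out/document.html.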
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args()
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('pdf2html')
|
||||
setup_cli_handlers(logger, level)
|
||||
if len(args) != 1:
|
||||
parser.print_help()
|
||||
print _('You must specify a single PDF file.')
|
||||
return 1
|
||||
options.output_dir = os.path.abspath(options.output_dir)
|
||||
converter = PDFConverter(os.path.abspath(args[0]), logger, options)
|
||||
converter.convert(options.output_dir)
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,2 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
@ -1,112 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
"""
|
||||
Convert .txt files to .lrf
|
||||
"""
|
||||
import os, sys, codecs, logging, re, shutil
|
||||
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks import ConversionError
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
|
||||
from calibre.ebooks.markdown import markdown
|
||||
from calibre import setup_cli_handlers
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf import OPFCreator
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(
|
||||
_('''%prog [options] mybook.txt
|
||||
|
||||
|
||||
%prog converts mybook.txt to mybook.lrf'''))
|
||||
parser.add_option('--debug-html-generation', action='store_true', default=False,
|
||||
dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.'))
|
||||
return parser
|
||||
|
||||
def fix_image_includes(sdir, tdir, match):
|
||||
path = match.group(1).split('/')
|
||||
src = os.path.join(sdir, *path)
|
||||
dest = os.path.join(tdir, *path)
|
||||
p = os.path.dirname(dest)
|
||||
if not os.path.exists(p):
|
||||
os.makedirs(p)
|
||||
if not os.path.exists(dest):
|
||||
shutil.copyfile(src, dest)
|
||||
|
||||
|
||||
def generate_html(txtfile, encoding, tdir):
|
||||
'''
|
||||
Convert txtfile to HTML and return the path to the generated HTML file.
|
||||
'''
|
||||
txtfile = os.path.abspath(txtfile)
|
||||
enc = encoding
|
||||
if not encoding:
|
||||
encodings = ['cp1252', 'latin-1', 'utf8', 'iso-8859-1', 'koi8_r', 'koi8_u']
|
||||
txt, enc = None, None
|
||||
for encoding in encodings:
|
||||
try:
|
||||
txt = codecs.open(txtfile, 'rb', encoding).read()
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
enc = encoding
|
||||
break
|
||||
if txt is None:
|
||||
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
|
||||
else:
|
||||
txt = codecs.open(txtfile, 'rb', enc).read()
|
||||
|
||||
print 'Converting text to HTML...'
|
||||
md = markdown.Markdown(
|
||||
extensions=['footnotes', 'tables', 'toc'],
|
||||
safe_mode=False,
|
||||
)
|
||||
html = '<html><body>'+md.convert(txt)+'</body></html>'
|
||||
for match in re.finditer(r'<img\s+[^>]*src="([^"]+)"', html):
|
||||
fix_image_includes(os.path.dirname(txtfile), tdir, match)
|
||||
p = os.path.join(tdir, 'index.html')
|
||||
open(p, 'wb').write(html.encode('utf-8'))
|
||||
mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
|
||||
opf = OPFCreator(tdir, mi)
|
||||
opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
|
||||
opf.create_spine([os.path.join(tdir, 'index.html')])
|
||||
opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb'))
|
||||
return p
|
||||
|
||||
def process_file(path, options, logger=None):
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('txt2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
txt = os.path.abspath(os.path.expanduser(path))
|
||||
if not hasattr(options, 'debug_html_generation'):
|
||||
options.debug_html_generation = False
|
||||
tdir = PersistentTemporaryDirectory('_txt2lrf')
|
||||
htmlfile = generate_html(txt, options.encoding, tdir)
|
||||
options.encoding = 'utf-8'
|
||||
if not options.debug_html_generation:
|
||||
options.force_page_break = 'h2'
|
||||
if not options.output:
|
||||
ext = '.lrs' if options.lrs else '.lrf'
|
||||
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
if not options.title:
|
||||
options.title = os.path.splitext(os.path.basename(path))[0]
|
||||
html_process_file(htmlfile, options, logger)
|
||||
else:
|
||||
print open(htmlfile, 'rb').read()
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) != 2:
|
||||
parser.print_help()
|
||||
print
|
||||
print 'No txt file specified'
|
||||
return 1
|
||||
process_file(args[1], options, logger)
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,89 +0,0 @@
|
||||
Demonstration of `txt2lrf`
|
||||
==========================
|
||||
|
||||
`txt2lrf` provides a convenient way to create LRF files with good formatting.
|
||||
`txt2lrf` recognizes a simple markup language called *markdown*.
|
||||
|
||||
The idea is to provide a lightweight markup for creating TXT files that are
readable on their own and can be converted to LRF automatically.
|
||||
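For example, assuming the converter is installed as `txt2lrf`, a file like this
one is converted with a single command (the file name is illustrative; the
output name defaults to the input name with an `.lrf` extension):

    txt2lrf demo.txt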
[{@name=toc}]()
|
||||
|
||||
<br /><br />
|
||||
|
||||
///Table of Contents///
|
||||
|
||||
|
||||
Text formatting
|
||||
---------------
|
||||
**Bold** and *italic* text is easily specified.
|
||||
|
||||
> Blockquotes are also very simple to specify.
|
||||
> This is a basic blockquote paragraph. I absolutely
|
||||
> love block quotes, don't you?
|
||||
|
||||
This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font.
|
||||
|
||||
|
||||
For details on the text formatting syntax visit
|
||||
|
||||
http://daringfireball.net/projects/markdown/syntax
|
||||
___
|
||||
[Table of Contents](#toc)
|
||||
|
||||
Lists
|
||||
-----
|
||||
Both ordered and unordered lists are supported.
|
||||
|
||||
|
||||
### Unordered lists
|
||||
|
||||
+ What a
|
||||
+ *nice*
|
||||
+ list
|
||||
|
||||
|
||||
|
||||
### Ordered lists
|
||||
|
||||
1. One
|
||||
2. Two
|
||||
3. Three
|
||||
|
||||
**Note:** Nested lists are not supported.
|
||||
|
||||
___
|
||||
[Table of Contents](#toc)
|
||||
|
||||
Tables
|
||||
------
|
||||
|
||||
Simple tables are easily generated.
|
||||
|
||||
| |* Col 1 *|* Col 2 *|
|
||||
|* Row 1 *| (1, 1) | (1, 2) |
|
||||
|* Row 2 *| (2, 1) | (2, 2) |
|
||||
|
||||
**Note:** Nested tables are not supported.
|
||||
|
||||
___
|
||||
[Table of Contents](#toc)
|
||||
|
||||
Images
|
||||
------
|
||||
|
||||
`txt2lrf` also supports inline images, like
|
||||
 this one.
|
||||
|
||||
___
|
||||
[Table of Contents](#toc)
|
||||
|
||||
Automatic TOC Creation
|
||||
----------------------
|
||||
|
||||
Inserting `///Table of Contents///` anywhere in the text automatically
generates a table of contents, with links that point to every heading
underlined with `-------`.
|
||||
|
||||
___
|
||||
[Table of Contents](#toc)
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 2.0 KiB |
@ -1,6 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
builtin_profiles = []
|
||||
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
|
@ -1,183 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''Convert websites into LRF files.'''
|
||||
|
||||
import sys, tempfile, shutil, os, logging, imp, inspect, re
|
||||
from urlparse import urlsplit
|
||||
|
||||
from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime
|
||||
from calibre.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file
|
||||
|
||||
from calibre.web.fetch.simple import create_fetcher
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class
|
||||
from calibre.ebooks.lrf.web import builtin_profiles, available_profiles
|
||||
|
||||
|
||||
def option_parser():
|
||||
parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
|
||||
'''%prog downloads a site from the web and converts it '''
|
||||
'''into an LRF file for use with the SONY Reader. '''
|
||||
'''website_profile is one of '''+str(available_profiles)+\
|
||||
''' If you specify a website_profile of default or do not specify '''
|
||||
'''it, you must specify the --url option.'''
|
||||
)
|
||||
|
||||
parser.add_option('-u', '--url', dest='url', default=None,
|
||||
help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
|
||||
parser.add_option('--user-profile', default=None,
|
||||
help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__)
|
||||
parser.add_option('--username', dest='username', default=None,
|
||||
help='Specify the username to be used while downloading. Only used if the profile supports it.')
|
||||
parser.add_option('--password', dest='password', default=None,
|
||||
help='Specify the password to be used while downloading. Only used if the profile supports it.')
|
||||
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
|
||||
default=None, type='int', dest='timeout')
|
||||
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse, i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
|
||||
default=None, type='int', dest='max_recursions')
|
||||
parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
|
||||
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
|
||||
parser.add_option('--delay', default=None, dest='delay', type='int',
|
||||
help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
|
||||
parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
|
||||
help='Do not download CSS stylesheets.', dest='no_stylesheets')
|
||||
parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
|
||||
help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
|
||||
parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
|
||||
help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
|
||||
parser.add_option('--keep-downloaded-files', default=False, action='store_true',
|
||||
help='''Do not delete the downloaded files after creating the LRF''')
|
||||
return parser
|
||||
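# The file passed to --user-profile simply defines a DefaultProfile subclass.
# A minimal sketch (class name, title and feed URL are illustrative only):
#
#   from calibre.ebooks.lrf.web.profiles import DefaultProfile
#
#   class MySite(DefaultProfile):
#       title = 'My Site'
#       feeds = [('All articles', 'http://example.com/feed.xml')]
#
# process_profile() below imports the file and uses the first DefaultProfile
# subclass it finds.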
|
||||
def fetch_website(options, logger):
|
||||
tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
|
||||
options.dir = tdir
|
||||
fetcher = create_fetcher(options, logger)
|
||||
fetcher.preprocess_regexps = options.preprocess_regexps
|
||||
return fetcher.start_fetch(options.url), tdir
|
||||
|
||||
def create_lrf(htmlfile, options, logger):
|
||||
if not options.author or options.author.lower() == 'unknown':
|
||||
options.author = __appname__
|
||||
options.header = True
|
||||
if options.output:
|
||||
options.output = os.path.abspath(os.path.expanduser(options.output))
|
||||
else:
|
||||
options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
|
||||
|
||||
process_file(htmlfile, options, logger)
|
||||
|
||||
def process_profile(args, options, logger=None):
|
||||
tdir = None
|
||||
try:
|
||||
if logger is None:
|
||||
level = logging.DEBUG if options.verbose else logging.INFO
|
||||
logger = logging.getLogger('web2lrf')
|
||||
setup_cli_handlers(logger, level)
|
||||
index = -1
|
||||
|
||||
if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]):
|
||||
profile = create_class(args[1])
|
||||
else:
|
||||
if options.user_profile is not None:
|
||||
path = os.path.abspath(options.user_profile)
|
||||
name = os.path.splitext(os.path.basename(path))[0]
|
||||
res = imp.find_module(name, [os.path.dirname(path)])
|
||||
module = imp.load_module(name, *res)
|
||||
classes = inspect.getmembers(module,
|
||||
lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
|
||||
and x is not DefaultProfile and x is not FullContentProfile)
|
||||
if not classes:
|
||||
raise CommandLineError('Invalid user profile '+path)
|
||||
builtin_profiles.append(classes[0][1])
|
||||
available_profiles.append(name)
|
||||
if len(args) < 2:
|
||||
args.append(name)
|
||||
args[1] = name
|
||||
index = -1
|
||||
if len(args) == 2:
|
||||
try:
|
||||
if isinstance(args[1], basestring):
|
||||
if args[1] != 'default':
|
||||
index = available_profiles.index(args[1])
|
||||
except ValueError:
|
||||
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
|
||||
else:
|
||||
raise CommandLineError('Only one profile at a time is allowed.')
|
||||
profile = DefaultProfile if index == -1 else builtin_profiles[index]
|
||||
|
||||
|
||||
|
||||
profile = profile(logger, options.verbose, options.username, options.password)
|
||||
if profile.browser is not None:
|
||||
options.browser = profile.browser
|
||||
|
||||
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
|
||||
val = getattr(options, opt)
|
||||
if val is None:
|
||||
setattr(options, opt, getattr(profile, opt))
|
||||
|
||||
if not options.url:
|
||||
options.url = profile.url
|
||||
|
||||
if not options.url:
|
||||
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
|
||||
|
||||
if not options.title:
|
||||
title = profile.title
|
||||
if not title:
|
||||
title = urlsplit(options.url).netloc
|
||||
options.title = title + strftime(profile.timefmt)
|
||||
|
||||
options.match_regexps += profile.match_regexps
|
||||
options.preprocess_regexps = profile.preprocess_regexps
|
||||
options.filter_regexps += profile.filter_regexps
|
||||
|
||||
options.encoding = profile.encoding if options.encoding is None else options.encoding
|
||||
|
||||
if len(args) == 2 and args[1] != 'default':
|
||||
options.anchor_ids = False
|
||||
|
||||
htmlfile, tdir = fetch_website(options, logger)
|
||||
options.encoding = 'utf-8'
|
||||
cwd = os.getcwd()
|
||||
if not options.output:
|
||||
title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title
|
||||
options.output = os.path.join(cwd, title+('.lrs' if options.lrs else '.lrf'))
|
||||
if not os.path.isabs(options.output):
|
||||
options.output = os.path.join(cwd, options.output)
|
||||
|
||||
option_parser().parse_args(profile.html2lrf_options, options)
|
||||
|
||||
try:
|
||||
os.chdir(os.path.dirname(htmlfile))
|
||||
create_lrf(os.path.basename(htmlfile), options, logger)
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
finally:
|
||||
try:
|
||||
profile.cleanup()
|
||||
except:
|
||||
pass
|
||||
if tdir and os.path.isdir(tdir):
|
||||
if options.keep_downloaded_files:
|
||||
print 'Downloaded files in ', tdir
|
||||
else:
|
||||
shutil.rmtree(tdir)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
options, args = parser.parse_args(args)
|
||||
if len(args) > 2 or (len(args) == 1 and not options.user_profile):
|
||||
parser.print_help()
|
||||
return 1
|
||||
try:
|
||||
process_profile(args, options, logger=logger)
|
||||
except CommandLineError, err:
|
||||
print >>sys.stderr, err
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1,572 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Contains the Base Profiles that can be used to easily create profiles to download
|
||||
particular websites.
|
||||
'''
|
||||
|
||||
import tempfile, time, calendar, re, operator, atexit, shutil, os
|
||||
from htmlentitydefs import name2codepoint
|
||||
from email.utils import formatdate
|
||||
|
||||
from calibre import __appname__, iswindows, browser, strftime
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
|
||||
|
||||
|
||||
class DefaultProfile(object):
|
||||
|
||||
#: The title to use for the LRF file
|
||||
#: @type: string
|
||||
title = 'Default Profile'
|
||||
|
||||
#: Maximum number of articles to download from each feed
|
||||
#: @type: integer
|
||||
max_articles_per_feed = 10
|
||||
|
||||
#: If True process the <description> element of the feed as HTML
|
||||
#: @type: boolean
|
||||
html_description = True
|
||||
|
||||
#: Maximum age, in days, of articles to download from the feeds
|
||||
#: @type: integer
|
||||
oldest_article = 7
|
||||
|
||||
#: Recommended frequency, in days, at which to download this profile.
|
||||
recommended_frequency = 7
|
||||
|
||||
#: Number of levels of links to follow
|
||||
#: @type: integer
|
||||
max_recursions = 1
|
||||
|
||||
#: Maximum number of files to download
|
||||
#: @type: integer
|
||||
max_files = 3000
|
||||
|
||||
#: Delay between consecutive downloads in seconds
|
||||
#: @type: integer
|
||||
delay = 0
|
||||
|
||||
#: Timeout for fetching files from server in seconds
|
||||
#: @type: integer
|
||||
timeout = 10
|
||||
|
||||
#: The format string for the date shown on the first page
|
||||
#: @type: string
|
||||
timefmt = ' [%a %d %b %Y]'
|
||||
|
||||
#: The order of elements to search for a URL when parsing the RSS feed. You
|
||||
#: can replace these elements by completely arbitrary elements to customize
|
||||
#: feed processing.
|
||||
#: @type: list of strings
|
||||
url_search_order = ['guid', 'link']
|
||||
|
||||
#: The format string used to parse the publication date in the RSS feed.
|
||||
#: If set to None some default heuristics are used, these may fail,
|
||||
#: in which case set this to the correct string or re-implement
|
||||
#: L{DefaultProfile.strptime} in your subclass.
|
||||
#: @type: string or None
|
||||
pubdate_fmt = None
|
||||
|
||||
#: If True will look for a publication date for each article.
|
||||
#: If False assumes the publication date is the current time.
|
||||
#: @type: boolean
|
||||
use_pubdate = True
|
||||
|
||||
#: Max number of characters in the short description.
|
||||
#: Used by L{FullContentProfile}
|
||||
#: @type: integer
|
||||
summary_length = 500
|
||||
|
||||
#: If True stylesheets are not downloaded and processed
|
||||
#: Convenient flag to disable loading of stylesheets for websites
|
||||
#: that have overly complex stylesheets unsuitable for conversion
|
||||
#: to ebooks formats
|
||||
#: @type: boolean
|
||||
no_stylesheets = False
|
||||
|
||||
#: If False, articles with the same title in the same feed
|
||||
#: are not downloaded multiple times
|
||||
#: @type: boolean
|
||||
allow_duplicates = False
|
||||
|
||||
#: If True the GUI will ask the user for a username and password
|
||||
#: to use while downloading
|
||||
#: @type: boolean
|
||||
needs_subscription = False
|
||||
|
||||
#: Specify an override encoding for sites that have an incorrect
|
||||
#: charset specification. The most common case is a site that declares
#: latin1 but actually uses cp1252.
|
||||
encoding = None
|
||||
|
||||
#: List of regular expressions that determines which links to follow
|
||||
#: If empty, it is ignored.
|
||||
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
|
||||
#: @type: list of strings
|
||||
match_regexps = []
|
||||
|
||||
#: List of regular expressions that determines which links to ignore
|
||||
#: If empty it is ignored
|
||||
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
|
||||
#: @type: list of strings
|
||||
filter_regexps = []
|
||||
|
||||
#: List of options to pass to html2lrf, to customize conversion
|
||||
#: to LRF
|
||||
#: @type: list of strings
|
||||
html2lrf_options = []
|
||||
|
||||
#: List of regexp substitution rules to run on the downloaded HTML. Each element of the
|
||||
#: list should be a two element tuple. The first element of the tuple should
|
||||
#: be a compiled regular expression and the second a callable that takes
|
||||
#: a single match object and returns a string to replace the match.
|
||||
#: @type: list of tuples
|
||||
preprocess_regexps = []
|
||||
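# For example, a single rule that strips <script> blocks would look like this
# (the pattern is illustrative only):
#
#   preprocess_regexps = [
#       (re.compile(r'<script.*?</script>', re.IGNORECASE | re.DOTALL),
#        lambda match: ''),
#   ]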
|
||||
# See the built-in profiles for examples of these settings.
|
||||
|
||||
#: The URL of the website
|
||||
#: @type: string
|
||||
url = ''
|
||||
|
||||
feeds = []
|
||||
CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
|
||||
|
||||
def get_feeds(self):
|
||||
'''
|
||||
Return a list of RSS feeds to fetch for this profile. Each element of the list
|
||||
must be a 2-element tuple of the form (title, url).
|
||||
'''
|
||||
if not self.feeds:
|
||||
raise NotImplementedError
|
||||
return self.feeds
|
||||
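# A subclass can either set the feeds attribute directly or override
# get_feeds(); for example (URLs are illustrative only):
#
#   feeds = [('Top Stories', 'http://example.com/rss/top.xml'),
#            ('World', 'http://example.com/rss/world.xml')]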
|
||||
@classmethod
|
||||
def print_version(cls, url):
|
||||
'''
|
||||
Take a URL pointing to an article and return the URL pointing to the
|
||||
print version of the article.
|
||||
'''
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def get_browser(cls):
|
||||
'''
|
||||
Return a browser instance used to fetch documents from the web.
|
||||
|
||||
If your profile requires that you log in first, override this method
|
||||
in your subclass. See for example the nytimes profile.
|
||||
'''
|
||||
return browser()
|
||||
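# A profile that needs to log in can override get_browser along these lines
# (a sketch only; the login URL and form field names vary by site, and the
# Barrons profile is a working instance of this pattern):
#
#   def get_browser(self):
#       br = DefaultProfile.get_browser()
#       br.open('http://example.com/login')
#       br.select_form(name='login_form')
#       br['user'] = self.username
#       br['password'] = self.password
#       br.submit()
#       return br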
|
||||
|
||||
|
||||
|
||||
def __init__(self, logger, verbose=False, username=None, password=None, lrf=True):
|
||||
self.logger = logger
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.verbose = verbose
|
||||
self.lrf = lrf
|
||||
self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
|
||||
self.browser = self.get_browser()
|
||||
try:
|
||||
self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
|
||||
except NotImplementedError:
|
||||
self.url = None
|
||||
atexit.register(cleanup, self.temp_dir)
|
||||
|
||||
def build_index(self):
|
||||
'''Build an RSS based index.html'''
|
||||
articles = self.parse_feeds()
|
||||
encoding = 'utf-8' if self.encoding is None else self.encoding
|
||||
def build_sub_index(title, items):
|
||||
ilist = ''
|
||||
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
|
||||
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
|
||||
for item in items:
|
||||
if not item.has_key('date'):
|
||||
item['date'] = time.strftime('%a, %d %b', time.localtime())
|
||||
ilist += li%item
|
||||
return u'''\
|
||||
<html>
|
||||
<body>
|
||||
<h2>%(title)s</h2>
|
||||
<ul>
|
||||
%(items)s
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
'''%dict(title=title, items=ilist.rstrip())
|
||||
|
||||
cnum = 0
|
||||
clist = ''
|
||||
categories = articles.keys()
|
||||
categories.sort()
|
||||
for category in categories:
|
||||
cnum += 1
|
||||
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
|
||||
prefix = 'file:' if iswindows else ''
|
||||
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
|
||||
src = build_sub_index(category, articles[category])
|
||||
open(cfile, 'wb').write(src.encode(encoding))
|
||||
|
||||
title = self.title
|
||||
if not isinstance(title, unicode):
|
||||
title = unicode(title, 'utf-8', 'replace')
|
||||
src = u'''\
|
||||
<html>
|
||||
<body>
|
||||
<h1>%(title)s</h1>
|
||||
<div style='text-align: right; font-weight: bold'>%(date)s</div>
|
||||
<ul>
|
||||
%(categories)s
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
'''%dict(date=strftime('%a, %d %B, %Y'),
|
||||
categories=clist, title=title)
|
||||
index = os.path.join(self.temp_dir, 'index.html')
|
||||
open(index, 'wb').write(src.encode(encoding))
|
||||
|
||||
return index
|
||||
|
||||
|
||||
@classmethod
|
||||
def tag_to_string(cls, tag, use_alt=True):
|
||||
'''
|
||||
Convenience method to take a BeautifulSoup Tag and extract the text from it
|
||||
recursively, including any CDATA sections and alt tag attributes.
|
||||
@param use_alt: If True try to use the alt attribute for tags that don't have any textual content
|
||||
@type use_alt: boolean
|
||||
@return: A unicode (possibly empty) object
|
||||
@rtype: unicode string
|
||||
'''
|
||||
if not tag:
|
||||
return ''
|
||||
if isinstance(tag, basestring):
|
||||
return tag
|
||||
strings = []
|
||||
for item in tag.contents:
|
||||
if isinstance(item, (NavigableString, CData)):
|
||||
strings.append(item.string)
|
||||
elif isinstance(item, Tag):
|
||||
res = cls.tag_to_string(item)
|
||||
if res:
|
||||
strings.append(res)
|
||||
elif use_alt and item.has_key('alt'):
|
||||
strings.append(item['alt'])
|
||||
return u''.join(strings)
|
||||
|
||||
def get_article_url(self, item):
|
||||
'''
|
||||
Return the article URL given an item Tag from a feed, or None if no valid URL is found
|
||||
@type item: BeautifulSoup.Tag
|
||||
@param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
|
||||
@rtype: string or None
|
||||
'''
|
||||
url = None
|
||||
for element in self.url_search_order:
|
||||
url = item.find(element.lower())
|
||||
if url:
|
||||
break
|
||||
return url
|
||||
|
||||
|
||||
def parse_feeds(self, require_url=True):
|
||||
'''
|
||||
Create list of articles from a list of feeds.
|
||||
@param require_url: If True, skip articles that don't have a link to an HTML page with the full article contents.
|
||||
@type require_url: boolean
|
||||
@rtype: dictionary
|
||||
@return: A dictionary whose keys are feed titles and whose values are each
|
||||
a list of dictionaries. Each list contains dictionaries of the form::
|
||||
{
|
||||
'title' : article title,
|
||||
'url' : URL of print version,
|
||||
'date' : The publication date of the article as a string,
|
||||
'description' : A summary of the article
|
||||
'content' : The full article (can be an empty string). This is used by FullContentProfile
|
||||
}
|
||||
'''
|
||||
added_articles = {}
|
||||
feeds = self.get_feeds()
|
||||
articles = {}
|
||||
for title, url in feeds:
|
||||
try:
|
||||
src = self.browser.open(url).read()
|
||||
except Exception, err:
|
||||
self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err))
|
||||
if self.verbose:
|
||||
self.logger.exception(' ')
|
||||
continue
|
||||
|
||||
articles[title] = []
|
||||
added_articles[title] = []
|
||||
soup = BeautifulStoneSoup(src)
|
||||
for item in soup.findAll('item'):
|
||||
try:
|
||||
atitle = item.find('title')
|
||||
if not atitle:
|
||||
continue
|
||||
|
||||
atitle = self.tag_to_string(atitle)
|
||||
if self.use_pubdate:
|
||||
pubdate = item.find('pubdate')
|
||||
if not pubdate:
|
||||
pubdate = item.find('dc:date')
|
||||
if not pubdate or not pubdate.string:
|
||||
pubdate = formatdate()
|
||||
pubdate = self.tag_to_string(pubdate)
|
||||
pubdate = pubdate.replace('+0000', 'GMT')
|
||||
|
||||
|
||||
url = self.get_article_url(item)
|
||||
url = self.tag_to_string(url)
|
||||
if require_url and not url:
|
||||
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
|
||||
continue
|
||||
purl = url
|
||||
try:
|
||||
purl = self.print_version(url)
|
||||
except Exception, err:
|
||||
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
|
||||
continue
|
||||
|
||||
content = item.find('content:encoded')
|
||||
if not content:
|
||||
content = item.find('description')
|
||||
if content:
|
||||
content = self.process_html_description(content, strip_links=False)
|
||||
else:
|
||||
content = ''
|
||||
|
||||
d = {
|
||||
'title' : atitle,
|
||||
'url' : purl,
|
||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||
'date' : pubdate if self.use_pubdate else formatdate(),
|
||||
'content' : content,
|
||||
}
|
||||
delta = time.time() - d['timestamp']
|
||||
if not self.allow_duplicates:
|
||||
if d['title'] in added_articles[title]:
|
||||
continue
|
||||
added_articles[title].append(d['title'])
|
||||
if delta > self.oldest_article*3600*24:
|
||||
continue
|
||||
|
||||
except Exception, err:
|
||||
if self.verbose:
|
||||
self.logger.exception('Error parsing article:\n%s'%(item,))
|
||||
continue
|
||||
try:
|
||||
desc = ''
|
||||
for c in item.findAll('description'):
|
||||
desc = self.tag_to_string(c)
|
||||
if desc:
|
||||
break
|
||||
d['description'] = self.process_html_description(desc) if self.html_description else desc
|
||||
except:
|
||||
d['description'] = ''
|
||||
articles[title].append(d)
|
||||
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
|
||||
articles[title] = articles[title][:self.max_articles_per_feed+1]
|
||||
#for item in articles[title]:
|
||||
# item.pop('timestamp')
|
||||
if not articles[title]:
|
||||
articles.pop(title)
|
||||
return articles
|
||||
|
||||
|
||||
def cleanup(self):
|
||||
'''
|
||||
Called after LRF file has been generated. Use it to do any cleanup like
|
||||
logging out of subscription sites, etc.
|
||||
'''
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def process_html_description(cls, tag, strip_links=True):
|
||||
'''
|
||||
Process a <description> tag that contains HTML markup, either
|
||||
entity encoded or escaped in a CDATA section.
|
||||
@return: HTML
|
||||
@rtype: string
|
||||
'''
|
||||
src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
|
||||
match = cls.CDATA_PAT.match(src.lstrip())
|
||||
if match:
|
||||
src = match.group(1)
|
||||
else:
|
||||
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
|
||||
for e in replaced_entities:
|
||||
ent = '&'+e+';'
|
||||
src = src.replace(ent, unichr(name2codepoint[e]))
|
||||
if strip_links:
|
||||
src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
|
||||
|
||||
return src
|
||||
|
||||
|
||||
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
|
||||
FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6)
|
||||
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
|
||||
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
|
||||
July=7, August=8, September=9, October=10,
|
||||
November=11, December=12)
|
||||
|
||||
@classmethod
|
||||
def strptime(cls, src):
|
||||
'''
|
||||
Take a string and return the date that string represents, in UTC as
|
||||
an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
|
||||
a bunch of heuristics and is a prime candidate for being overridden in a
|
||||
subclass.
|
||||
@param src: Timestamp as a string
|
||||
@type src: string
|
||||
@return: time as an epoch
|
||||
@rtype: number
|
||||
'''
|
||||
delta = 0
|
||||
zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
|
||||
if zone:
|
||||
delta = zone.group(1)
|
||||
hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
|
||||
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
|
||||
src = src.replace(zone.group(), '')
|
||||
if cls.pubdate_fmt is None:
|
||||
src = src.strip().split()
|
||||
try:
|
||||
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
|
||||
except KeyError:
|
||||
src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+','
|
||||
try:
|
||||
src[2] = str(cls.MONTH_MAP[src[2]])
|
||||
except KeyError:
|
||||
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
|
||||
fmt = '%w, %d %m %Y %H:%M:%S'
|
||||
src = src[:5] # Discard extra information
|
||||
try:
|
||||
time_t = time.strptime(' '.join(src), fmt)
|
||||
except ValueError:
|
||||
time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
|
||||
return calendar.timegm(time_t)-delta
|
||||
else:
|
||||
return calendar.timegm(time.strptime(src, cls.pubdate_fmt))
|
||||
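# Worked example of the default heuristic (a sketch):
#   'Wed, 02 Oct 2002 15:00:00 +0200'
#   -> the zone matches, so delta = 2*3600 seconds and '+0200' is stripped
#   -> 'Wed,' becomes '3,' via DAY_MAP and 'Oct' becomes '10' via MONTH_MAP
#   -> '3, 02 10 2002 15:00:00' is parsed with '%w, %d %m %Y %H:%M:%S'
#   -> calendar.timegm() of the result minus delta is the UTC epoch time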
|
||||
def command_line_options(self):
|
||||
args = []
|
||||
args.append('--max-recursions='+str(self.max_recursions))
|
||||
args.append('--delay='+str(self.delay))
|
||||
args.append('--max-files='+str(self.max_files))
|
||||
for i in self.match_regexps:
|
||||
args.append('--match-regexp="'+i+'"')
|
||||
for i in self.filter_regexps:
|
||||
args.append('--filter-regexp="'+i+'"')
|
||||
return args
|
||||
|
||||
|
||||
class FullContentProfile(DefaultProfile):
|
||||
'''
|
||||
This profile is designed for feeds that embed the full article content in the RSS file.
|
||||
'''
|
||||
|
||||
max_recursions = 0
|
||||
article_counter = 0
|
||||
|
||||
|
||||
def build_index(self):
|
||||
'''Build an RSS based index.html. '''
|
||||
articles = self.parse_feeds(require_url=False)
|
||||
|
||||
def build_sub_index(title, items):
|
||||
ilist = ''
|
||||
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
|
||||
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
|
||||
for item in items:
|
||||
content = item['content']
|
||||
if not content:
|
||||
self.logger.debug('Skipping article as it has no content:%s'%item['title'])
|
||||
continue
|
||||
item['description'] = cutoff(item['description'], self.summary_length)+'…'
|
||||
self.article_counter = self.article_counter + 1
|
||||
url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
|
||||
item['url'] = url
|
||||
open(url, 'wb').write((u'''\
|
||||
<html>
|
||||
<body>
|
||||
<h2>%s</h2>
|
||||
<div>
|
||||
%s
|
||||
</div>
|
||||
</body>
|
||||
</html>'''%(item['title'], content)).encode('utf-8')
|
||||
)
|
||||
ilist += li%item
|
||||
return u'''\
|
||||
<html>
|
||||
<body>
|
||||
<h2>%(title)s</h2>
|
||||
<ul>
|
||||
%(items)s
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
'''%dict(title=title, items=ilist.rstrip())
|
||||
|
||||
cnum = 0
|
||||
clist = ''
|
||||
categories = articles.keys()
|
||||
categories.sort()
|
||||
for category in categories:
|
||||
cnum += 1
|
||||
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
|
||||
prefix = 'file:' if iswindows else ''
|
||||
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
|
||||
src = build_sub_index(category, articles[category])
|
||||
open(cfile, 'wb').write(src.encode('utf-8'))
|
||||
|
||||
src = '''\
|
||||
<html>
|
||||
<body>
|
||||
<h1>%(title)s</h1>
|
||||
<div style='text-align: right; font-weight: bold'>%(date)s</div>
|
||||
<ul>
|
||||
%(categories)s
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
|
||||
categories=clist, title=self.title)
|
||||
index = os.path.join(self.temp_dir, 'index.html')
|
||||
open(index, 'wb').write(src.encode('utf-8'))
|
||||
return index
|
||||
|
||||
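# Truncate src at roughly position pos, extending by up to fuzz characters so
# that the cut lands just after a nearby ';' or '>'; this keeps a summary from
# ending in the middle of an HTML entity or tag.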
def cutoff(src, pos, fuzz=50):
|
||||
si = src.find(';', pos)
|
||||
if si > 0 and si-pos > fuzz:
|
||||
si = -1
|
||||
gi = src.find('>', pos)
|
||||
if gi > 0 and gi-pos > fuzz:
|
||||
gi = -1
|
||||
npos = max(si, gi)
|
||||
if npos < 0:
|
||||
npos = pos
|
||||
return src[:npos+1]
|
||||
|
||||
def create_class(src):
|
||||
environment = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile}
|
||||
exec src in environment
|
||||
for item in environment.values():
|
||||
if hasattr(item, 'build_index'):
|
||||
if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
|
||||
return item
|
||||
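# create_class() receives the source text of a profile class, for example
# (illustrative only):
#
#   class MySite(DefaultProfile):
#       title = 'My Site'
#       feeds = [('All articles', 'http://example.com/feed.xml')]
#
# DefaultProfile and FullContentProfile are already present in the exec
# namespace; whichever other class ends up with a build_index attribute is
# returned.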
|
||||
def cleanup(tdir):
|
||||
try:
|
||||
if os.path.isdir(tdir):
|
||||
shutil.rmtree(tdir)
|
||||
except:
|
||||
pass
|
||||
|
@ -1,38 +0,0 @@
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
|
||||
class AssociatedPress(DefaultProfile):
|
||||
|
||||
title = 'Associated Press'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 15
|
||||
html2lrf_options = ['--force-page-break-before-tag="chapter"']
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
|
||||
(r'<body class="apple-rss-no-unread-mode" onLoad="setup(null)">.*?<!-- start Entries -->', lambda match : '<body>'),
|
||||
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
|
||||
(r'<script.*?>.*?</script>', lambda match : ''),
|
||||
(r'<body.*?>.*?<span class="headline">', lambda match : '<body><span class="headline"><chapter>'),
|
||||
(r'<tr><td><div class="body">.*?<p class="ap-story-p">', lambda match : '<p class="ap-story-p">'),
|
||||
(r'<p class="ap-story-p">', lambda match : '<p>'),
|
||||
(r'Learn more about our <a href="http://apdigitalnews.com/privacy.html">Privacy Policy</a>.*?</body>', lambda match : '</body>'),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
|
||||
('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
|
||||
('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
|
||||
('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
|
||||
('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
|
||||
('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
|
||||
('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
|
||||
('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
|
||||
('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
|
||||
]
|
@ -1,47 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Atlantic(DefaultProfile):
|
||||
|
||||
title = 'The Atlantic'
|
||||
max_recursions = 2
|
||||
INDEX = 'http://www.theatlantic.com/doc/current'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
|
||||
lambda m: '<body><div id="storytop"')
|
||||
]
|
||||
|
||||
def parse_feeds(self):
|
||||
articles = []
|
||||
|
||||
src = self.browser.open(self.INDEX).read()
|
||||
soup = BeautifulSoup(src)
|
||||
|
||||
issue = soup.find('span', attrs={'class':'issue'})
|
||||
if issue:
|
||||
self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
|
||||
|
||||
for item in soup.findAll('div', attrs={'class':'item'}):
|
||||
a = item.find('a')
|
||||
if a and a.has_key('href'):
|
||||
url = a['href']
|
||||
url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
|
||||
title = self.tag_to_string(a)
|
||||
byline = item.find(attrs={'class':'byline'})
|
||||
date = self.tag_to_string(byline) if byline else ''
|
||||
description = ''
|
||||
articles.append({
|
||||
'title':title,
|
||||
'date':date,
|
||||
'url':url,
|
||||
'description':description
|
||||
})
|
||||
|
||||
|
||||
return {'Current Issue' : articles }
|
||||
|
||||
|
@ -1,75 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import os
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre import iswindows
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
|
||||
class AutomaticRSSProfile(DefaultProfile):
|
||||
'''
|
||||
Make downloading of RSS feeds completely automatic. Only input
|
||||
required is the URL of the feed.
|
||||
'''
|
||||
|
||||
max_recursions = 2
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.cindex = 1
|
||||
DefaultProfile.__init__(self, *args, **kwargs)
|
||||
|
||||
def fetch_content(self, index):
|
||||
raw = open(index, 'rb').read()
|
||||
if self.encoding:
|
||||
raw = raw.decode(self.encoding)
|
||||
enc = self.encoding
|
||||
else:
|
||||
raw, enc = xml_to_unicode(raw)
|
||||
isoup = BeautifulSoup(raw)
|
||||
for a in isoup.findAll('a', href=True):
|
||||
src = a['href']
|
||||
if src.startswith('file:'):
|
||||
src = src[5:]
|
||||
if os.access(src, os.R_OK):
|
||||
self.fetch_content(src)
|
||||
continue
|
||||
try:
|
||||
src = self.browser.open(src).read()
|
||||
except:
|
||||
continue
|
||||
soup = BeautifulSoup(src)
|
||||
header, content = [], []
|
||||
head = soup.find('head')
|
||||
if head is not None:
|
||||
for style in head('style'):
|
||||
header.append(unicode(style))
|
||||
body = soup.find('body')
|
||||
if body is None:
|
||||
continue
|
||||
for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
|
||||
in_table = False
|
||||
c = tag.parent
|
||||
while c is not None:
|
||||
if c.name == 'table':
|
||||
in_table = True
|
||||
break
|
||||
c = c.parent
|
||||
if in_table:
|
||||
continue
|
||||
content.append(unicode(tag))
|
||||
|
||||
cfile = 'content%d.html'%self.cindex
|
||||
self.cindex += 1
|
||||
cfile = os.path.join(os.path.dirname(index), cfile)
|
||||
html = '<html>\n<head>%s</head>\n<body>%s</body></html>'%('\n'.join(header), '\n'.join(content))
|
||||
|
||||
open(cfile, 'wb').write(html.encode(enc))
|
||||
a['href'] = ('file:' if iswindows else '') + cfile
|
||||
open(index, 'wb').write(unicode(isoup).encode(enc))
|
||||
|
||||
def build_index(self):
|
||||
index = DefaultProfile.build_index(self)
|
||||
self.fetch_content(index)
return index
|
||||
|
@ -1,90 +0,0 @@
|
||||
##
|
||||
## web2lrf profile to download articles from Barrons.com
|
||||
## can download subscriber-only content if username and
|
||||
## password are supplied.
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class Barrons(DefaultProfile):
|
||||
|
||||
title = 'Barron\'s'
|
||||
max_recursions = 3
|
||||
max_articles_per_feed = 50
|
||||
needs_subscription = True
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
html_description = True
|
||||
no_stylesheets = False
|
||||
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
|
||||
html2lrf_options = ['--ignore-tables', '--base-font-size=10']
|
||||
##delay = 1
|
||||
|
||||
## Don't grab articles more than 7 days old
|
||||
oldest_article = 7
|
||||
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
## Remove anything before the body of the article.
|
||||
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
|
||||
|
||||
## Remove any insets from the body of the article.
|
||||
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
|
||||
|
||||
## Remove any reprint info from the body of the article.
|
||||
(r'<hr size.*?<p', lambda match : '<p'),
|
||||
|
||||
## Remove anything after the end of the article.
|
||||
(r'<!-- article end.*?</body>', lambda match : '</body>'),
|
||||
]
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = DefaultProfile.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://commerce.barrons.com/auth/login')
|
||||
br.select_form(name='login_form')
|
||||
br['user'] = self.username
|
||||
br['password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
## Use the print version of a page when available.
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('/article/', '/article_print/')
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
|
||||
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
|
||||
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
|
||||
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
|
||||
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
|
||||
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
|
||||
]
|
||||
|
||||
## Logout of website
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
# try:
|
||||
# self.browser.set_debug_responses(True)
|
||||
# import sys, logging
|
||||
# logger = logging.getLogger("mechanize")
|
||||
# logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||
# logger.setLevel(logging.INFO)
|
||||
|
||||
# res = self.browser.open('http://online.barrons.com/logout')
|
||||
# except:
|
||||
# import traceback
|
||||
# traceback.print_exc()
|
||||
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Fetch the BBC.
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class BBC(DefaultProfile):
|
||||
|
||||
title = 'The BBC'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
no_stylesheets = True
|
||||
|
||||
preprocess_regexps = \
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
# Remove footer from individual stories
|
||||
(r'<div class=.footer.>.*?Published',
|
||||
lambda match : '<p></p><div class="footer">Published'),
|
||||
# Add some style info in place of disabled stylesheet
|
||||
(r'<link.*?type=.text/css.*?>', lambda match :
|
||||
'''<style type="text/css">
|
||||
.headline {font-size: x-large;}
|
||||
.fact { padding-top: 10pt }
|
||||
</style>'''),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
|
||||
|
||||
def get_feeds(self):
|
||||
src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
|
||||
soup = BeautifulSoup(src[src.index('<html'):])
|
||||
feeds = []
|
||||
ul = soup.find('ul', attrs={'class':'rss'})
|
||||
for link in ul.findAll('a'):
|
||||
feeds.append((link.string, link['href']))
|
||||
return feeds
|
||||
|
@ -1,46 +0,0 @@
|
||||
|
||||
import re, time
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class ChristianScienceMonitor(DefaultProfile):
|
||||
|
||||
title = 'Christian Science Monitor'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 20
|
||||
no_stylesheets = True
|
||||
|
||||
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
|
||||
(r'<div class="pubdate">.*?</div>', lambda m: ''),
|
||||
(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
|
||||
lambda match : '</body>'),
|
||||
]]
|
||||
|
||||
|
||||
def parse_feeds(self):
|
||||
soup = BeautifulSoup(self.browser.open('http://www.csmonitor.com/textedition'))
|
||||
articles = {}
|
||||
feed = []
|
||||
for tag in soup.findAll(['h2', 'p']):
|
||||
if tag.name == 'h2':
|
||||
title = self.tag_to_string(tag)
|
||||
feed = []
|
||||
articles[title] = feed
|
||||
elif tag.has_key('class') and tag['class'] == 'story':
|
||||
a = tag.find('a')
|
||||
if a is not None and a.has_key('href'):
|
||||
feed.append({
|
||||
'title': self.tag_to_string(a),
|
||||
'url' : 'http://www.csmonitor.com'+a['href'],
|
||||
'date' : time.strftime('%d %b'),
|
||||
'content' : '',
|
||||
})
|
||||
a.extract()
|
||||
feed[-1]['description'] = self.tag_to_string(tag).strip()
|
||||
return articles
|
||||
|
@ -1,51 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Profile to download CNN
|
||||
'''
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class CNN(DefaultProfile):
|
||||
|
||||
title = 'CNN'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%d %b %Y]'
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
oldest_article = 15
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
|
||||
(r'<head>.*?<title', lambda match : '<head><title'),
|
||||
(r'</title>.*?</head>', lambda match : '</title></head>'),
|
||||
(r'<body.*?<\!\-\-Article.*?>', lambda match : ''),
|
||||
(r'<\!\-\-Article End\-\->.*?</body>', lambda match : '</body>'),
|
||||
(r'(</h\d>)<ul>.*?</ul>', lambda match : match.group(1)), # drop story highlights
|
||||
(r'<h2>(.*?)</h2><h1>(.*?)</h1>', lambda match : '<h1>' + match.group(1) + '</h1><h2>' + match.group(2) + '</h2>'), # sports uses h2 for main title and h1 for subtitle (???) switch these around
|
||||
(r'<span class="cnnEmbeddedMosLnk">.*?</span>', lambda match : ''), # drop 'watch more' links
|
||||
(r'(<div class="cnnstorybody">).*?(<p)', lambda match : match.group(1) + match.group(2)), # drop sports photos
|
||||
(r'</?table.*?>|</?tr.*?>|</?td.*?>', lambda match : ''), # drop table formatting
|
||||
(r'<div class="cnnendofstorycontent".*?>.*?</div>', lambda match : ''), # drop extra business links
|
||||
(r'<a href="#TOP">.*?</a>', lambda match : '') # drop business 'to top' link
|
||||
] ]
|
||||
|
||||
def print_version(self, url):
|
||||
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),
|
||||
('World', 'http://rss.cnn.com/rss/cnn_world.rss'),
|
||||
('U.S.', 'http://rss.cnn.com/rss/cnn_us.rss'),
|
||||
('Sports', 'http://rss.cnn.com/rss/si_topstories.rss'),
|
||||
('Business', 'http://rss.cnn.com/rss/money_latest.rss'),
|
||||
('Politics', 'http://rss.cnn.com/rss/cnn_allpolitics.rss'),
|
||||
('Law', 'http://rss.cnn.com/rss/cnn_law.rss'),
|
||||
('Technology', 'http://rss.cnn.com/rss/cnn_tech.rss'),
|
||||
('Science & Space', 'http://rss.cnn.com/rss/cnn_space.rss'),
|
||||
('Health', 'http://rss.cnn.com/rss/cnn_health.rss'),
|
||||
('Entertainment', 'http://rss.cnn.com/rss/cnn_showbiz.rss'),
|
||||
('Education', 'http://rss.cnn.com/rss/cnn_education.rss'),
|
||||
('Offbeat', 'http://rss.cnn.com/rss/cnn_offbeat.rss'),
|
||||
('Most Popular', 'http://rss.cnn.com/rss/cnn_mostpopular.rss')
|
||||
]
|
@ -1,73 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Fetch The Economist.
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Economist(DefaultProfile):
|
||||
|
||||
title = 'The Economist'
|
||||
timefmt = ' [%d %b %Y]'
|
||||
max_recursions = 2
|
||||
|
||||
|
||||
TITLES = [
|
||||
'The world this week',
|
||||
'Letters',
|
||||
'Briefings',
|
||||
'Special reports',
|
||||
'Britain',
|
||||
'Europe',
|
||||
'United States',
|
||||
'The Americas',
|
||||
'Middle East and Africa',
|
||||
'Asia',
|
||||
'International',
|
||||
'Business',
|
||||
'Finance and economics',
|
||||
'Science and technology',
|
||||
'Books and arts',
|
||||
'Indicators'
|
||||
]
|
||||
|
||||
preprocess_regexps = \
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
# Remove advert
|
||||
(r'<noscript.*?</noscript>', lambda match: ''),
|
||||
(r'<\!--\s+INVISIBLE SKIP .*?-->.*?<\!--\s+INVISIBLE SKIP .*?\s+-->',
|
||||
lambda match : ''),
|
||||
(r'<img.+?alt="AP".+?/>', lambda match: ''),
|
||||
]
|
||||
]
|
||||
|
||||
def __init__(self, logger, verbose=False, username=None, password=None):
|
||||
DefaultProfile.__init__(self, username, password)
|
||||
self.browser = None # Needed as otherwise there are timeouts while fetching actual articles
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
|
||||
|
||||
def get_feeds(self):
|
||||
src = self.browser.open('http://economist.com/rss/').read()
|
||||
soup = BeautifulSoup(src)
|
||||
feeds = []
|
||||
for ul in soup.findAll('ul'):
|
||||
lis = ul.findAll('li')
|
||||
try:
|
||||
title, link = lis[0], lis[1]
|
||||
except IndexError:
|
||||
continue
|
||||
title = title.string
|
||||
if title:
|
||||
title = title.strip()
|
||||
if title not in self.__class__.TITLES:
|
||||
continue
|
||||
a = link.find('a')
|
||||
feeds.append((title, a['href'].strip()))
|
||||
|
||||
return feeds
|
@ -1,28 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Profile to download FAZ.net
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class FazNet(DefaultProfile):
|
||||
|
||||
title = 'FAZ NET'
|
||||
max_recursions = 2
|
||||
html_description = True
|
||||
max_articles_per_feed = 30
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'Zum Thema</span>.*?</BODY>', re.IGNORECASE | re.DOTALL),
|
||||
lambda match : ''),
|
||||
]
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('.html?rss_aktuell', '~Afor~Eprint.html')
|
||||
|
@ -1,36 +0,0 @@
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class JerusalemPost(DefaultProfile):
|
||||
|
||||
title = 'Jerusalem Post'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 10
|
||||
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
|
||||
(r'<BODY.*?>.*?<!-- start Entries -->', lambda match : '<BODY><!-- start Entries -->'),
|
||||
(r'<!-- end Entries -->.*?</BODY>', lambda match : '</BODY>'),
|
||||
(r'<script.*?>.*?</script>', lambda match : ''),
|
||||
(r'<div class="apple-rss-article apple-rss-read" onclick=.*?<div class="apple-rss-article-body">', lambda match : ''),
|
||||
(r'<img src=\'/images/logo_NWAnews.gif\' alt=\'NWAnews.com :: Northwest Arkansas\' News Source\'.*?>', lambda match : ''),
|
||||
(r'<img src=\'/images/logo_adg.gif\'.*?>', lambda match : ''),
|
||||
(r'<P CLASS="smallprint">.*?</body>', lambda match : '</body>'),
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'),
|
||||
('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'),
|
||||
('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'),
|
||||
('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'),
|
||||
('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter')
|
||||
|
@ -1,44 +0,0 @@
|
||||
'''
|
||||
Profile to download Jutarnji.hr by Valloric
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class Jutarnji(DefaultProfile):
|
||||
|
||||
title = 'Jutarnji'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%d %b %Y]'
|
||||
max_articles_per_feed = 80
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<body.*?<span class="vijestnaslov">', re.IGNORECASE | re.DOTALL), lambda match : '<body><span class="vijestnaslov">'),
|
||||
(re.compile(r'</div>.*?</td>', re.IGNORECASE | re.DOTALL), lambda match : '</div></td>'),
|
||||
(re.compile(r'<a name="addComment.*?</body>', re.IGNORECASE | re.DOTALL), lambda match : '</body>'),
|
||||
(re.compile(r'<br>', re.IGNORECASE | re.DOTALL), lambda match : ''),
|
||||
]
|
||||
|
||||
## Getting the print version
|
||||
|
||||
def print_version(self, url):
|
||||
return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + url[len(url)-9:len(url)-3]
|
||||
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Or add any new new RSS feed URL's here, sorted alphabetically when converted to LRF
|
||||
## If you want one of these at the top, append a space in front of the name.
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
(' Naslovnica', 'http://www.jutarnji.hr/rss'),
|
||||
('Sport', 'http://www.jutarnji.hr/sport/rss'),
|
||||
('Novac', 'http://www.jutarnji.hr/novac/rss'),
|
||||
('Kultura i zivot', 'http://www.jutarnji.hr/kultura_i_zivot/rss'),
|
||||
('Automoto', 'http://www.jutarnji.hr/auto_moto/rss'),
|
||||
('Hi-Tech', 'http://www.jutarnji.hr/kultura_i_zivot/hi-tech/rss'),
|
||||
('Dom i nekretnine', 'http://www.jutarnji.hr/nekretnine/rss'),
|
||||
]
|
@ -1,91 +0,0 @@
|
||||
## Copyright (C) 2008 B.Scott Wxby [bswxby] &
|
||||
## Copyright (C) 2007 David Chen SonyReader<at>DaveChen<dot>org
|
||||
##
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## Version 0.3-2008_2_28
|
||||
## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
|
||||
## https://calibre.kovidgoyal.net/wiki/UserProfiles
|
||||
##
|
||||
## Usage:
|
||||
## >web2lrf --user-profile nasa.py
|
||||
## Comment out the RSS feeds you don't want in the last section below
|
||||
##
|
||||
## Output:
|
||||
## NASA [YearMonthDate Time].lrf
|
||||
##
|
||||
'''
|
||||
Custom User Profile to download RSS News Feeds and Articles from Wired.com
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class NASA(DefaultProfile):
|
||||
|
||||
title = 'NASA'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%Y%b%d %H%M]'
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
|
||||
## Don't grab articles more than 30 days old
|
||||
oldest_article = 30
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
## Fix the encoding to UTF-8
|
||||
(r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),
|
||||
|
||||
## Remove any banners/links/ads/cruft before the body of the article.
|
||||
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
|
||||
|
||||
## Remove any links/ads/comments/cruft from the end of the body of the article.
|
||||
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
|
||||
|
||||
## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
|
||||
(r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1),),
|
||||
|
||||
## This removes header and footer information from each print version.
|
||||
(re.compile(r'<!-- Top Header starts -->.*?<!-- Body starts -->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
|
||||
(re.compile(r'<hr align="center" width="200"><p align="center">.*?<!-- Press Release standard text ends -->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
|
||||
(re.compile(r'<!-- Top Header starts -->.*?<!---->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
|
||||
|
||||
## This removes the "download image" of various sizes from the Image of the day.
|
||||
(re.compile(r'(?is)<div id="download_image_box_print">.*?<div id="caption_region_print">'), lambda match : '<New Stuff>'),
|
||||
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
## NASA's print pages differ only by the ending "_prt.htm", so I've replaced them below.
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('.html', '_prt.htm')
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Or add any new new RSS feed URL's here, sorted alphabetically when converted to LRF
|
||||
## If you want one of these at the top, append a space in front of the name.
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
(' Breaking News', 'http://www.nasa.gov/rss/breaking_news.rss'),
|
||||
('Image of the Day', 'http://www.nasa.gov/rss/image_of_the_day.rss'),
|
||||
('Moon and Mars Exploration', 'http://www.nasa.gov/rss/moon_mars.rss'),
|
||||
('Shuttle and Station News', 'http://www.nasa.gov/rss/shuttle_station.rss'),
|
||||
('Solar System News', 'http://www.nasa.gov/rss/solar_system.rss'),
|
||||
('Universe News', 'http://www.nasa.gov/rss/universe.rss'),
|
||||
('Earth News', 'http://www.nasa.gov/rss/earth.rss'),
|
||||
('Aeronautics News', 'http://www.nasa.gov/rss/aeronautics.rss'),
|
||||
]
|
||||
|
@ -1,37 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Profile to download Newsweek
|
||||
'''
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class Newsweek(DefaultProfile):
|
||||
|
||||
title = 'Newsweek'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%d %b %Y]'
|
||||
html_description = True
|
||||
oldest_article = 15
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
if not url.endswith('/'):
|
||||
url += '/'
|
||||
return url + 'output/print'
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
|
||||
('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
|
||||
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
|
||||
('Health', 'http://feeds.newsweek.com/headlines/health'),
|
||||
('Business', 'http://feeds.newsweek.com/headlines/business'),
|
||||
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
|
||||
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
|
||||
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
|
||||
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
|
||||
('Society', 'http://feeds.newsweek.com/newsweek/society'),
|
||||
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
|
||||
]
|
||||
|
||||
|
@ -1,56 +0,0 @@
|
||||
'''
|
||||
Profile to download Jutarnji.hr
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class NewYorker(DefaultProfile):
|
||||
|
||||
title = 'The New Yorker'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%d %b %Y]'
|
||||
max_articles_per_feed = 20
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
oldest_article = 14
|
||||
|
||||
|
||||
## Getting the print version
|
||||
def print_version(self, url):
|
||||
return url + '?printable=true'
|
||||
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<body.*?<!-- start article content -->', re.IGNORECASE | re.DOTALL), lambda match : '<body>'),
|
||||
(re.compile(r'<div class="utils"'),
|
||||
lambda match : '<div class="utils" style="display:none"'),
|
||||
(re.compile(r'<div class="articleRailLinks"'),
|
||||
lambda match : '<div class="articleRailLinks" style="display:none"'),
|
||||
(re.compile(r'<div id="keywords"'),
|
||||
lambda match : '<div id="keywords" style="display:none"'),
|
||||
(re.compile(r'<!-- end article body -->.*?</body>', re.IGNORECASE | re.DOTALL), lambda match : '</body>'),
|
||||
(re.compile(r'<!-- start video content -->.*?<!-- end video content -->', re.IGNORECASE | re.DOTALL), lambda match : '<!-- start video content --><!-- end video content -->'),
|
||||
]
|
||||
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Or add any new new RSS feed URL's here, sorted alphabetically when converted to LRF
|
||||
## If you want one of these at the top, append a space in front of the name.
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
('Online Only', 'http://feeds.newyorker.com/services/rss/feeds/online.xml'),
|
||||
('The Talk Of The Town', 'http://feeds.newyorker.com/services/rss/feeds/talk.xml'),
|
||||
('Reporting and Essays', 'http://feeds.newyorker.com/services/rss/feeds/reporting.xml'),
|
||||
('Arts and Culture', 'http://feeds.newyorker.com/services/rss/feeds/arts.xml'),
|
||||
('Humor', 'http://feeds.newyorker.com/services/rss/feeds/humor.xml'),
|
||||
('Fiction and Poetry', 'http://feeds.newyorker.com/services/rss/feeds/fiction.xml'),
|
||||
('Comment', 'http://feeds.newyorker.com/services/rss/feeds/comment.xml'),
|
||||
('The Financial Page', 'http://feeds.newyorker.com/services/rss/feeds/financial.xml'),
|
||||
('Politics', 'http://feeds.newyorker.com/services/rss/feeds/politics.xml'),
|
||||
('Movies', 'http://feeds.newyorker.com/services/rss/feeds/movies.xml'),
|
||||
('Books', 'http://feeds.newyorker.com/services/rss/feeds/books.xml'),
|
||||
('Tables For Two', 'http://feeds.newyorker.com/services/rss/feeds/tables.xml'),
|
||||
]
|
@ -1,24 +0,0 @@
|
||||
## By Lorenzo goehr, lorenzogoehr@hotmail.com for Libprs500 by Kovid Goyal
|
||||
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
import re
|
||||
|
||||
class NewYorkReviewOfBooks(DefaultProfile):
|
||||
|
||||
title = 'New York Review of Books'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 50
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Current Issue', 'http://feeds.feedburner.com/nybooks') ]
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
|
||||
(r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),
|
||||
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
|
||||
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
|
||||
(r'<div class="nav">.*?<h2>', lambda match: '<h2>'),
|
||||
(r'<table.*?>.*?(<img .*?/table>)', lambda match: match.group(1),), ] ]
|
@ -1,100 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Profile to download the New York Times
|
||||
'''
|
||||
import re, time
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class NYTimes(DefaultProfile):
|
||||
|
||||
title = 'The New York Times'
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = True
|
||||
max_recursions = 2
|
||||
recommended_frequency = 1
|
||||
encoding = 'cp1252'
|
||||
html2lrf_options = ['--base-font-size=0']
|
||||
|
||||
|
||||
preprocess_regexps = \
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
# Remove header bar
|
||||
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
|
||||
(r'<div class="articleTools">.*></ul>', lambda match : ''),
|
||||
# Remove footer bar
|
||||
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
|
||||
(r'<div id="footer">.*', lambda match : '</body></html>'),
|
||||
]
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = DefaultProfile.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://www.nytimes.com/auth/login')
|
||||
br.select_form(name='login')
|
||||
br['USERID'] = self.username
|
||||
br['PASSWORD'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def get_feeds(self):
|
||||
src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
|
||||
soup = BeautifulSoup(src[src.index('<html'):])
|
||||
feeds = []
|
||||
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
|
||||
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
|
||||
'Dining & Wine', 'Home & Garden', 'Multimedia',
|
||||
'Most E-mailed Articles',
|
||||
'Automobiles', 'Fashion & Style', 'Television News',
|
||||
'Education']:
|
||||
feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
|
||||
|
||||
return feeds
|
||||
|
||||
|
||||
def parse_feeds(self):
|
||||
if self.lrf: # The new feed causes the SONY Reader to crash
|
||||
return DefaultProfile.parse_feeds(self)
|
||||
src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
|
||||
soup = BeautifulSoup(src)
|
||||
|
||||
def feed_title(div):
|
||||
return ''.join(div.findAll(text=True, recursive=False)).strip()
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline']}):
|
||||
|
||||
if div['class'] == 'section-headline':
|
||||
key = feed_title(div)
|
||||
articles[key] = []
|
||||
|
||||
elif div['class'] in ['story', 'story headline']:
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
continue
|
||||
url = self.print_version(a['href'])
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = time.strftime('%a, %d %b', time.localtime())
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
|
||||
feed = key if key is not None else 'Uncategorized'
|
||||
if not articles.has_key(feed):
|
||||
articles[feed] = []
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate, description=description,
|
||||
content=''))
|
||||
|
||||
|
||||
return articles
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?&pagewanted=print'
|
@ -1,40 +0,0 @@
|
||||
##
|
||||
## web2lrf profile to download articles from Portfolio.com
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import FullContentProfile
|
||||
|
||||
class Portfolio(FullContentProfile):
|
||||
|
||||
title = 'Portfolio'
|
||||
max_articles_per_feed = 50
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
html2lrf_options = ['--ignore-tables']
|
||||
##delay = 1
|
||||
|
||||
oldest_article = 30
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
|
||||
def get_feeds(self):
|
||||
return [
|
||||
('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'),
|
||||
('Careers', 'http://feeds.portfolio.com/portfolio/careers'),
|
||||
('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
|
||||
('Executives','http://feeds.portfolio.com/portfolio/executives'),
|
||||
('News and Markets', 'http://feeds.portfolio.com/portfolio/news'),
|
||||
('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'),
|
||||
('Capital', 'http://feeds.portfolio.com/portfolio/capital'),
|
||||
('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'),
|
||||
('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'),
|
||||
('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'),
|
||||
('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'),
|
||||
('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'),
|
||||
('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'),
|
||||
('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
|
||||
]
|
||||
|
@ -1,39 +0,0 @@
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
|
||||
class Reuters(DefaultProfile):
|
||||
|
||||
title = 'Reuters'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 10
|
||||
html_description = True
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
##(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
|
||||
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
|
||||
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
|
||||
(r'<script.*?>.*?</script>', lambda match : ''),
|
||||
(r'<body>.*?<div class="contentBand">', lambda match : '<body>'),
|
||||
(r'<h3>Share:</h3>.*?</body>', lambda match : '<!-- END:: Shared Module id=36615 --></body>'),
|
||||
(r'<div id="atools" class="articleTools">.*?<div class="linebreak">', lambda match : '<div class="linebreak">'),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
|
||||
('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
|
||||
('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
|
||||
('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'),
|
||||
('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'),
|
||||
('Environment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'),
|
||||
('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
|
||||
('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return ('http://www.reuters.com/article/id' + url + '?sp=true')
|
@ -1,36 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
Fetch Spiegel Online.
|
||||
'''
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
import re
|
||||
|
||||
class SpiegelOnline(DefaultProfile):
|
||||
|
||||
title = 'Spiegel Online'
|
||||
timefmt = ' [ %Y-%m-%d %a]'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 40
|
||||
use_pubdate = False
|
||||
no_stylesheets = True
|
||||
|
||||
preprocess_regexps = \
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
# Remove Zum Thema footer
|
||||
(r'<div class="spArticleCredit.*?</body>', lambda match: '</body>'),
|
||||
]
|
||||
]
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Spiegel Online', 'http://www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml') ]
|
||||
|
||||
|
||||
def print_version(self,url):
|
||||
tokens = url.split(',')
|
||||
tokens[-2:-2] = ['druck|']
|
||||
return ','.join(tokens).replace('|,','-')
|
@ -1,36 +0,0 @@
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
|
||||
class UnitedPressInternational(DefaultProfile):
|
||||
|
||||
title = 'United Press International'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 15
|
||||
html2lrf_options = ['--override-css= "H1 {font-family: Arial; font-weight: bold; color: #000000; size: 10pt;}"']
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
|
||||
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
|
||||
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
|
||||
(r'<script.*?>.*?</script>', lambda match : ''),
|
||||
(r'<body onload=.*?>.*?<a href="http://www.upi.com">', lambda match : '<body style="font: 8pt arial;">'),
|
||||
##(r'<div class=\'headerDIV\'><h2><a style="color: #990000;" href="http://www.upi.com/NewsTrack/Top_News/">Top News</a></h2></div>.*?<br clear="all">', lambda match : ''),
|
||||
(r'<script src="http://www.g.*?>.*?</body>', lambda match : ''),
|
||||
(r'<span style="font: 16pt arial', lambda match : '<span style="font: 12pt arial'),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Top Stories', 'http://www.upi.com/rss/NewsTrack/Top_News/'),
|
||||
('Science', 'http://www.upi.com/rss/NewsTrack/Science/'),
|
||||
('Heatlth', 'http://www.upi.com/rss/NewsTrack/Health/'),
|
||||
('Quirks', 'http://www.upi.com/rss/NewsTrack/Quirks/'),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
return (url + 'print_view/')
|
@ -1,43 +0,0 @@
|
||||
'''
|
||||
Profile to download Jutarnji.hr by Valloric
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class USAToday(DefaultProfile):
|
||||
|
||||
title = 'USA Today'
|
||||
max_recursions = 2
|
||||
timefmt = ' [%d %b %Y]'
|
||||
max_articles_per_feed = 20
|
||||
html_description = True
|
||||
#no_stylesheets = True
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<BODY.*?<!--Article Goes Here-->', re.IGNORECASE | re.DOTALL), lambda match : '<BODY>'),
|
||||
(re.compile(r'<!--Article End-->.*?</BODY>', re.IGNORECASE | re.DOTALL), lambda match : '</BODY>'),
|
||||
]
|
||||
|
||||
## Getting the print version
|
||||
|
||||
def print_version(self, url):
|
||||
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
|
||||
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Or add any new new RSS feed URL's here, sorted alphabetically when converted to LRF
|
||||
## If you want one of these at the top, append a space in front of the name.
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
(' Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
|
||||
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
|
||||
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
|
||||
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
|
||||
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
|
||||
('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
|
||||
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
|
||||
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
|
||||
]
|
@ -1,44 +0,0 @@
|
||||
import re
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
|
||||
class WashingtonPost(DefaultProfile):
|
||||
|
||||
title = 'Washington Post'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 20
|
||||
use_pubdate = False
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
|
||||
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
|
||||
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
|
||||
(r'<script.*?>.*?</script>', lambda match : ''),
|
||||
(r'<body.*?>.*?.correction {', lambda match : '<body><style>.correction {'),
|
||||
(r'<span class="display:none;" name="pubDate".*?>.*?</body>', lambda match : '<body>'),
|
||||
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'),
|
||||
('Politics', 'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'),
|
||||
('Nation', 'http://www.www.washingtonpost.com/wp-dyn/rss/nation/index.xml'),
|
||||
('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'),
|
||||
('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'),
|
||||
('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'),
|
||||
('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'),
|
||||
('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'),
|
||||
('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'),
|
||||
]
|
||||
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
return (url.rpartition('.')[0] + '_pf.html')
|
||||
|
@ -1,108 +0,0 @@
|
||||
##
|
||||
## web2lrf profile to download articles from WSJ.com
|
||||
## can download subscriber-only content if username and
|
||||
## password are supplied.
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
import re
|
||||
from urlparse import urlparse
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class WallStreetJournal(DefaultProfile):
|
||||
|
||||
title = 'The Wall Street Journal'
|
||||
max_recursions = 2
|
||||
needs_subscription = True
|
||||
no_stylesheets = False
|
||||
max_articles_per_feed = 10
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
html2lrf_options = ['--ignore-tables']
|
||||
|
||||
## Don't grab articles more than 7 days old
|
||||
oldest_article = 7
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
## Remove anything before the body of the article.
|
||||
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
|
||||
|
||||
## Remove anything after the end of the article.
|
||||
(r'<!-- article end.*?</body>', lambda match : '</body>'),
|
||||
]
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = DefaultProfile.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://online.wsj.com/login')
|
||||
br.select_form(name='login_form')
|
||||
br['user'] = self.username
|
||||
br['password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def print_version(self, url):
|
||||
article = urlparse(url).path.rpartition('/')[-1]
|
||||
return 'http://online.wsj.com/article_print/'+article
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
|
||||
def get_feeds(self):
|
||||
return [
|
||||
#('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
|
||||
#('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
|
||||
#('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
|
||||
(' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
|
||||
(' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
|
||||
# ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
|
||||
(' Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
|
||||
(' Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
|
||||
(' Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
|
||||
(' Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
|
||||
(' Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
|
||||
('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
|
||||
('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
|
||||
('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
|
||||
('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
|
||||
('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
|
||||
('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
|
||||
('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
|
||||
('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
|
||||
('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
|
||||
('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
|
||||
('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
|
||||
('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
|
||||
('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
|
||||
('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
|
||||
('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
|
||||
('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
|
||||
('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
|
||||
('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
|
||||
('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
|
||||
('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
|
||||
('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
|
||||
('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
|
||||
('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
|
||||
('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
|
||||
('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
|
||||
('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
|
||||
('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
|
||||
('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
|
||||
('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
|
||||
# ('Personal Journal - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
|
||||
('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
|
||||
('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
|
||||
('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
|
||||
('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
|
||||
# ('Weekend & Leisure - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
|
||||
# ('Weekend & Leisure - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
|
||||
('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
|
||||
]
|
||||
|
||||
## Logout of website
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
# self.browser.open('http://commerce.wsj.com/auth/postlogout')
|
@ -1,26 +0,0 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
Fetch Die Zeit.
|
||||
'''
|
||||
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class ZeitNachrichten(DefaultProfile):
|
||||
|
||||
title = 'Die Zeit Nachrichten'
|
||||
timefmt = ' [%d %b %Y]'
|
||||
max_recursions = 2
|
||||
max_articles_per_feed = 40
|
||||
html_description = True
|
||||
no_stylesheets = True
|
||||
encoding = 'latin1'
|
||||
|
||||
|
||||
def get_feeds(self):
|
||||
return [ ('Zeit.de', 'http://newsfeed.zeit.de/news/index') ]
|
||||
|
||||
def print_version(self,url):
|
||||
return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '')
|
||||
|
@ -29,6 +29,10 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
),
|
||||
])
|
||||
|
||||
recommendations = set([
|
||||
('dont_justify', True, OptionRecommendation.HIGH),
|
||||
])
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter
|
||||
|
@ -22,7 +22,6 @@ entry_points = {
|
||||
'web2disk = calibre.web.fetch.simple:main',
|
||||
'feeds2disk = calibre.web.feeds.main:main',
|
||||
'calibre-server = calibre.library.server:main',
|
||||
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
|
||||
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
||||
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
||||
'isbndb = calibre.ebooks.metadata.isbndb:main',
|
||||
|
@ -19,7 +19,7 @@ from calibre import browser, __appname__, iswindows, \
|
||||
strftime, __version__, preferred_encoding
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.lrf import entity_to_unicode
|
||||
from calibre import entity_to_unicode
|
||||
from calibre.web import Recipe
|
||||
from calibre.ebooks import render_html
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
|
@ -46,13 +46,12 @@ recipe_modules = ['recipe_' + r for r in (
|
||||
|
||||
import re, imp, inspect, time, os
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe
|
||||
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.path import path
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre import __appname__, english_sort
|
||||
|
||||
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
|
||||
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe)
|
||||
basic_recipe_names = (i.__name__ for i in basic_recipes)
|
||||
|
||||
|
||||
@ -83,7 +82,7 @@ def compile_recipe(src):
|
||||
Compile the code in src and return the first object that is a recipe or profile.
|
||||
@param src: Python source code
|
||||
@type src: string
|
||||
@return: Recipe/Profile class or None, if no such class was found in C{src}
|
||||
@return: Recipe class or None, if no such class was found in C{src}
|
||||
'''
|
||||
global _tdir, _crep
|
||||
if _tdir is None or not os.path.exists(_tdir):
|
||||
@ -97,7 +96,6 @@ def compile_recipe(src):
|
||||
src = re.sub(r'from __future__.*', '', src)
|
||||
f = open(temp, 'wb')
|
||||
src = 'from %s.web.feeds.news import BasicNewsRecipe, AutomaticNewsRecipe\n'%__appname__ + src
|
||||
src = 'from %s.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile\n'%__appname__ + src
|
||||
src = '# coding: utf-8\n' + src
|
||||
src = 'from __future__ import with_statement\n' + src
|
||||
|
||||
@ -108,7 +106,7 @@ def compile_recipe(src):
|
||||
module = imp.load_module(temp.namebase, *module)
|
||||
classes = inspect.getmembers(module,
|
||||
lambda x : inspect.isclass(x) and \
|
||||
issubclass(x, (DefaultProfile, BasicNewsRecipe)) and \
|
||||
issubclass(x, (BasicNewsRecipe,)) and \
|
||||
x not in basic_recipes)
|
||||
if not classes:
|
||||
return None
|
||||
@ -119,11 +117,10 @@ def compile_recipe(src):
|
||||
def get_builtin_recipe(title):
|
||||
'''
|
||||
Return a builtin recipe/profile class whose title == C{title} or None if no such
|
||||
recipe exists. Also returns a flag that is True iff the found recipe is really
|
||||
an old-style Profile.
|
||||
recipe exists.
|
||||
|
||||
@type title: string
|
||||
@rtype: class or None, boolean
|
||||
@rtype: class or None
|
||||
'''
|
||||
for r in recipes:
|
||||
if r.title == title:
|
||||
|
Loading…
x
Reference in New Issue
Block a user