LRF Output

Kovid Goyal 2009-05-02 17:34:19 -07:00
parent 3a99f99104
commit 538d310bb8
54 changed files with 490 additions and 3853 deletions

View File

@@ -291,6 +291,7 @@ from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.mobi.output import MOBIOutput
from calibre.ebooks.lrf.output import LRFOutput
from calibre.ebooks.txt.output import TXTOutput
from calibre.ebooks.pdf.output import PDFOutput
from calibre.ebooks.pml.input import PMLInput
@@ -310,7 +311,7 @@ from calibre.devices.jetbook.driver import JETBOOK
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
FB2Input, ODTInput, RTFInput, EPUBOutput, RecipeInput, PMLInput,
PMLOutput, MOBIOutput]
PMLOutput, MOBIOutput, LRFOutput]
plugins += [PRS500, PRS505, PRS700, CYBOOKG3, KINDLE, KINDLE2, BLACKBERRY,
EB600, JETBOOK]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
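The closing comprehension (its continuation is truncated by this hunk) sweeps the module namespace so that plugin classes defined in builtins are registered without being listed by hand. A minimal, standalone illustration of that locals()-scanning idiom (plain Python, not calibre code):

class A(object):
    pass

B = 42

# Mirrors the isinstance(x, type) filter above: picks up the class A,
# ignores the plain value B.
plugins = [x for x in list(locals().values()) if isinstance(x, type)]
print plugins   # [<class '__main__.A'>]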

View File

@@ -236,7 +236,6 @@ OptionRecommendation(name='page_breaks_before',
'before the specified elements.')
),
OptionRecommendation(name='margin_top',
recommended_value=5.0, level=OptionRecommendation.LOW,
help=_('Set the top margin in pts. Default is %default. '
@@ -614,11 +613,18 @@ OptionRecommendation(name='list_recipes',
if self.opts.extra_css and os.path.exists(self.opts.extra_css):
self.opts.extra_css = open(self.opts.extra_css, 'rb').read()
oibl = self.opts.insert_blank_line
orps = self.opts.remove_paragraph_spacing
if self.output_plugin.file_type == 'lrf':
self.opts.insert_blank_line = False
self.opts.remove_paragraph_spacing = False
flattener = CSSFlattener(fbase=fbase, fkey=fkey,
lineh=self.opts.line_height,
untable=self.output_plugin.file_type in ('mobi','lit'),
unfloat=self.output_plugin.file_type in ('mobi', 'lit'))
flattener(self.oeb, self.opts)
self.opts.insert_blank_line = oibl
self.opts.remove_paragraph_spacing = orps
if self.opts.linearize_tables and \
self.output_plugin.file_type not in ('mobi', 'lrf'):
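The hunk above saves insert_blank_line and remove_paragraph_spacing, forces both off while the CSS flattener runs for LRF output, and then restores the user's values so later pipeline stages are unaffected. A small standalone sketch of that save/override/restore pattern (illustrative only; the class and function names here are invented, not calibre API):

class Opts(object):
    insert_blank_line = True
    remove_paragraph_spacing = True

def flatten_for(file_type, opts):
    # Save the user-visible values before flattening.
    oibl, orps = opts.insert_blank_line, opts.remove_paragraph_spacing
    if file_type == 'lrf':
        # The LRF writer handles paragraph spacing itself, so disable these.
        opts.insert_blank_line = False
        opts.remove_paragraph_spacing = False
    try:
        pass  # ... run the CSS flattener here ...
    finally:
        # Restore so later stages still see the original choices.
        opts.insert_blank_line, opts.remove_paragraph_spacing = oibl, orps

flatten_for('lrf', Opts())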

View File

@@ -1,43 +1,19 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
This package contains logic to read and write LRF files.
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
"""
import sys, os
from optparse import OptionValueError
from htmlentitydefs import name2codepoint
from uuid import uuid4
from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, PutObj, \
Paragraph, TextStyle, BlockStyle
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
TextStyle, BlockStyle
from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
from calibre.ebooks import ConversionError
from calibre import __appname__, __version__, __author__, iswindows
from calibre.utils.config import OptionParser
__docformat__ = "epytext"
preferred_source_formats = [
'LIT',
'MOBI',
'EPUB',
'ODT',
'HTML',
'HTM',
'XHTM',
'XHTML',
'PRC',
'AZW',
'FB2',
'RTF',
'PDF',
'TXT',
'ZIP',
'RAR'
]
class LRFParseError(Exception):
pass
@@ -55,174 +31,8 @@ class PRS500_PROFILE(object):
header_height = 30 #: In px
default_fonts = { 'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman",
'serif': "Dutch801 Rm BT Roman"}
name = 'prs500'
profile_map = {
PRS500_PROFILE.name : PRS500_PROFILE,
}
def profile_from_string(option, opt_str, value, parser):
try:
profile = profile_map[value]
setattr(parser.values, option.dest, profile)
except KeyError:
raise OptionValueError('Profile: '+value+' is not implemented. Implemented profiles: %s'%(profile_map.keys()))
def option_parser(usage, gui_mode=False):
parser = OptionParser(usage=usage, gui_mode=gui_mode)
metadata = parser.add_option_group('METADATA OPTIONS')
metadata.add_option("-t", "--title", action="store", type="string", default=None,\
dest="title", help=_("Set the title. Default: filename."))
metadata.add_option("-a", "--author", action="store", type="string", \
dest="author", help=_("Set the author(s). Multiple authors should be set as a comma separated list. Default: %default"),
default=_('Unknown'))
metadata.add_option("--comment", action="store", type="string", \
dest="freetext", help=_("Set the comment."), default=_('Unknown'))
metadata.add_option("--category", action="store", type="string", \
dest="category", help=_("Set the category"), default=_('Unknown'))
metadata.add_option('--title-sort', action='store', default='', dest='title_sort',
help=_('Sort key for the title'))
metadata.add_option('--author-sort', action='store', default='', dest='author_sort',
help=_('Sort key for the author'))
metadata.add_option('--publisher', action='store', default=_('Unknown'), dest='publisher',
help=_('Publisher'))
metadata.add_option('--cover', action='store', dest='cover', default=None, \
help=_('Path to file containing image to be used as cover'))
metadata.add_option('--use-metadata-cover', action='store_true', default=False,
help=_('If there is a cover graphic detected in the source file, use that instead of the specified cover.'))
parser.add_option('-o', '--output', action='store', default=None, \
help=_('Output file name. Default is derived from input filename'))
parser.add_option('--ignore-tables', action='store_true', default=False, dest='ignore_tables',
help=_('Render HTML tables as blocks of text instead of actual tables. This is necessary if the HTML contains very large or complex tables.'))
laf = parser.add_option_group('LOOK AND FEEL')
laf.add_option('--base-font-size', action='store', type='float', default=10.,
help=_('''Specify the base font size in pts. All fonts are rescaled accordingly. This option obsoletes the --font-delta option and takes precedence over it. To use --font-delta, set this to 0. Default: %defaultpt'''))
laf.add_option('--enable-autorotation', action='store_true', default=False,
help=_('Enable autorotation of images that are wider than the screen width.'),
dest='autorotation')
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
help=_('Set the space between words in pts. Default is %default'))
laf.add_option('--blank-after-para', action='store_true', default=False,
dest='blank_after_para', help=_('Separate paragraphs by blank lines.'))
laf.add_option('--header', action='store_true', default=False, dest='header',
help=_('Add a header to all the pages with title and author.'))
laf.add_option('--headerformat', default="%t by %a", dest='headerformat', type='string',
help=_('Set the format of the header. %a is replaced by the author and %t by the title. Default is %default'))
laf.add_option('--header-separation', default=0, type='int',
help=_('Add extra spacing below the header. Default is %default px.'))
laf.add_option('--override-css', default=None, dest='_override_css', type='string',
help=_('Override the CSS. Can be either a path to a CSS stylesheet or a string. If it is a string it is interpreted as CSS.'))
laf.add_option('--use-spine', default=False, dest='use_spine', action='store_true',
help=_('Use the <spine> element from the OPF file to determine the order in which the HTML files are appended to the LRF. The .opf file must be in the same directory as the base HTML file.'))
laf.add_option('--minimum-indent', default=0, type='float',
help=_('Minimum paragraph indent (the indent of the first line of a paragraph) in pts. Default: %default'))
laf.add_option('--font-delta', action='store', type='float', default=0., \
help=_("""Increase the font size by 2 * FONT_DELTA pts and """
'''the line spacing by FONT_DELTA pts. FONT_DELTA can be a fraction.'''
"""If FONT_DELTA is negative, the font size is decreased."""),
dest='font_delta')
laf.add_option('--ignore-colors', action='store_true', default=False, dest='ignore_colors',
help=_('Render all content as black on white instead of the colors specified by the HTML or CSS.'))
page = parser.add_option_group('PAGE OPTIONS')
profiles = profile_map.keys()
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
choices=profiles, action='callback', callback=profile_from_string,
help=_('''Profile of the target device for which this LRF is '''
'''being generated. The profile determines things like the '''
'''resolution and screen size of the target device. '''
'''Default: %s Supported profiles: ''')%(PRS500_PROFILE.name,)+\
', '.join(profiles))
page.add_option('--left-margin', default=20, dest='left_margin', type='int',
help=_('''Left margin of page. Default is %default px.'''))
page.add_option('--right-margin', default=20, dest='right_margin', type='int',
help=_('''Right margin of page. Default is %default px.'''))
page.add_option('--top-margin', default=10, dest='top_margin', type='int',
help=_('''Top margin of page. Default is %default px.'''))
page.add_option('--bottom-margin', default=0, dest='bottom_margin', type='int',
help=_('''Bottom margin of page. Default is %default px.'''))
page.add_option('--render-tables-as-images', default=False, action='store_true',
help=_('Render tables in the HTML as images (useful if the document has large or complex tables)'))
page.add_option('--text-size-multiplier-for-rendered-tables', type='float', default=1.0,
help=_('Multiply the size of text in rendered tables by this factor. Default is %default'))
link = parser.add_option_group('LINK PROCESSING OPTIONS')
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
dest='link_levels',
help=_(r'''The maximum number of levels to recursively process '''
'''links. A value of 0 means that links are not followed. '''
'''A negative value means that <a> tags are ignored.'''))
link.add_option('--link-exclude', dest='link_exclude', default='@',
help=_('''A regular expression. <a> tags whose href '''
'''matches will be ignored. Defaults to %default'''))
link.add_option('--no-links-in-toc', action='store_true', default=False,
dest='no_links_in_toc',
help=_('''Don't add links to the table of contents.'''))
chapter = parser.add_option_group('CHAPTER OPTIONS')
chapter.add_option('--disable-chapter-detection', action='store_true',
default=False, dest='disable_chapter_detection',
help=_('''Prevent the automatic detection of chapters.'''))
chapter.add_option('--chapter-regex', dest='chapter_regex',
default='chapter|book|appendix',
help=_('''The regular expression used to detect chapter titles.'''
''' It is searched for in heading tags (h1-h6). Defaults to %default'''))
chapter.add_option('--chapter-attr', default='$,,$',
help=_('Detect a chapter beginning at an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". You can set the attribute to "none" to match only on tag names. So for example, to match all h2 tags, you would use "h2,none,". Default is %default'''))
chapter.add_option('--page-break-before-tag', dest='page_break', default='h[12]',
help=_('''If html2lrf does not find any page breaks in the '''
'''html file and cannot detect chapter headings, it will '''
'''automatically insert page-breaks before the tags whose '''
'''names match this regular expression. Defaults to %default. '''
'''You can disable it by setting the regexp to "$". '''
'''The purpose of this option is to try to ensure that '''
'''there are no really long pages as this degrades the page '''
'''turn performance of the LRF. Thus this option is ignored '''
'''if the current page has only a few elements.'''))
chapter.add_option('--force-page-break-before-tag', dest='force_page_break',
default='$', help=_('Force a page break before tags whose names match this regular expression.'))
chapter.add_option('--force-page-break-before-attr', dest='force_page_break_attr',
default='$,,$', help=_('Force a page break before an element having the specified attribute. The format for this option is tagname regexp,attribute name,attribute value regexp. For example to match all heading tags that have the attribute class="chapter" you would use "h\d,class,chapter". Default is %default'''))
chapter.add_option('--add-chapters-to-toc', action='store_true',
default=False, dest='add_chapters_to_toc',
help=_('''Add detected chapters to the table of contents.'''))
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help=_('''Preprocess Baen HTML files to improve generated LRF.'''))
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
help=_('''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.'''))
prepro.add_option('--book-designer', action='store_true', default=False, dest='book_designer',
help=_('''Use this option on html0 files from Book Designer.'''))
fonts = parser.add_option_group('FONT FAMILIES',
_('''Specify truetype font families for serif, sans-serif and monospace fonts. '''
'''These fonts will be embedded in the LRF file. Note that custom fonts lead to '''
'''slower page turns. '''
'''For example: '''
'''--serif-family "Times New Roman"
'''))
fonts.add_option('--serif-family',
default=None, dest='serif_family', type='string',
help=_('The serif family of fonts to embed'))
fonts.add_option('--sans-family',
default=None, dest='sans_family', type='string',
help=_('The sans-serif family of fonts to embed'))
fonts.add_option('--mono-family',
default=None, dest='mono_family', type='string',
help=_('The monospace family of fonts to embed'))
debug = parser.add_option_group('DEBUG OPTIONS')
debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
help=_('''Be verbose while processing'''))
debug.add_option('--lrs', action='store_true', dest='lrs', \
help=_('Convert to LRS'), default=False)
parser.add_option('--minimize-memory-usage', action='store_true', default=False,
help=_('Minimize memory usage at the cost of longer processing times. Use this option if you are on a memory constrained machine.'))
parser.add_option('--encoding', default=None,
help=_('Specify the character encoding of the source file. If the output LRF file contains strange characters, try changing this option. A common encoding for files from windows computers is cp-1252. Another common choice is utf-8. The default is to try and guess the encoding.'))
return parser
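# Illustrative usage sketch (not part of this file): the parser built above is
# driven like any optparse parser by the html2lrf-style command line tools,
# assuming standard optparse semantics for parse_args().
example_parser = option_parser(usage='%prog [options] mybook.html')
example_options, example_args = example_parser.parse_args(['mybook.html', '--title', 'My Book'])
# example_options.title == 'My Book'; example_options.wordspace defaults to 2.5
# and example_options.profile to PRS500_PROFILE.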
def find_custom_fonts(options, logger):
from calibre.utils.fontconfig import files_for_family
@@ -238,16 +48,16 @@ def find_custom_fonts(options, logger):
f = family(options.sans_family)
fonts['sans'] = files_for_family(f)
if not fonts['sans']:
logger.warn('Unable to find sans family %s'%f)
if options.mono_family:
f = family(options.mono_family)
fonts['mono'] = files_for_family(f)
if not fonts['mono']:
logger.warn('Unable to find mono family %s'%f)
return fonts
def Book(options, logger, font_delta=0, header=None,
profile=PRS500_PROFILE, **settings):
ps = {}
ps['topmargin'] = options.top_margin
@@ -258,7 +68,7 @@ def Book(options, logger, font_delta=0, header=None,
- profile.fudge
if header:
hdr = Header()
hb = TextBlock(textStyle=TextStyle(align='foot',
fontsize=int(profile.header_font_size*10)),
blockStyle=BlockStyle(blockwidth=ps['textwidth']))
hb.append(header)
@@ -269,20 +79,20 @@ def Book(options, logger, font_delta=0, header=None,
ps['topmargin'] = 0
ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \
- ps['headheight'] - ps['headsep'] - profile.fudge
fontsize = int(10*profile.font_size+font_delta*20)
baselineskip = fontsize + 20
fonts = find_custom_fonts(options, logger)
tsd = dict(fontsize=fontsize,
parindent=int(10*profile.parindent),
linespace=int(10*profile.line_space),
baselineskip=baselineskip,
wordspace=10*options.wordspace)
if fonts['serif'] and fonts['serif'].has_key('normal'):
tsd['fontfacename'] = fonts['serif']['normal'][1]
book = _Book(textstyledefault=tsd,
pagestyledefault=ps,
blockstyledefault=dict(blockwidth=ps['textwidth']),
bookid=uuid4().hex,
**settings)
@@ -291,7 +101,7 @@ def Book(options, logger, font_delta=0, header=None,
for font in fonts[family].values():
book.embed_font(*font)
FONT_FILE_MAP[font[1]] = font[0]
for family in ['serif', 'sans', 'mono']:
if not fonts[family]:
fonts[family] = { 'normal' : (None, profile.default_fonts[family]) }
@@ -299,4 +109,3 @@ def Book(options, logger, font_delta=0, header=None,
raise ConversionError, 'Could not find the normal version of the ' + family + ' font'
return book, fonts
from calibre import entity_to_unicode
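Book() wraps a pylrs Book with page, text and block style defaults computed from the conversion options and the device profile, embedding any custom fonts it can find. A hedged sketch of the expected call, where options is the namespace produced by an LRF option parser and logger is any logging.Logger (both assumed here):

from calibre.ebooks.lrf import Book, PRS500_PROFILE

book, fonts = Book(options, logger, profile=PRS500_PROFILE)
# fonts maps 'serif'/'sans'/'mono' to the embedded variants; families that
# could not be found fall back to the profile's default_fonts, as above.
print fonts['serif'].keys()   # contains at least 'normal'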

View File

@@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,199 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Convert any ebook file into a LRF file.'''
import sys, os, logging, shutil, tempfile, re
from calibre.ebooks import UnknownFormatError
from calibre.ebooks.lrf import option_parser as _option_parser
from calibre import __appname__, setup_cli_handlers, extract
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.lrf.lit.convert_from import process_file as lit2lrf
from calibre.ebooks.lrf.pdf.convert_from import process_file as pdf2lrf
from calibre.ebooks.lrf.rtf.convert_from import process_file as rtf2lrf
from calibre.ebooks.lrf.txt.convert_from import process_file as txt2lrf
from calibre.ebooks.lrf.html.convert_from import process_file as html2lrf
from calibre.ebooks.lrf.epub.convert_from import process_file as epub2lrf
from calibre.ebooks.lrf.mobi.convert_from import process_file as mobi2lrf
from calibre.ebooks.lrf.fb2.convert_from import process_file as fb22lrf
from calibre.customize.ui import run_plugins_on_postprocess, run_plugins_on_preprocess
def largest_file(files):
maxsize, file = 0, None
for f in files:
size = os.stat(f).st_size
if size > maxsize:
maxsize = size
file = f
return file
def find_htmlfile(dir):
ext_pat = re.compile(r'\.(x){0,1}htm(l){0,1}', re.IGNORECASE)
toc_pat = re.compile(r'toc', re.IGNORECASE)
index_pat = re.compile(r'index', re.IGNORECASE)
toc_files, index_files, files = [], [], []
for root, dirs, _files in os.walk(dir):
for f in _files:
f = os.path.abspath(os.path.join(root, f))
ext = os.path.splitext(f)[1]
if ext and ext_pat.match(ext):
toc_files.append(f) if toc_pat.search(f) else \
index_files.append(f) if index_pat.search(f) else \
files.append(f)
a = toc_files if toc_files else index_files if index_files else files
if a:
return largest_file(a)
def number_of_unhidden_files(base, listing):
ans = 0
for i in listing:
i = os.path.join(base, i)
if os.path.isdir(i) or os.path.basename(i).startswith('.'):
continue
ans += 1
return ans
def unhidden_directories(base, listing):
ans = []
for i in listing:
if os.path.isdir(os.path.join(base, i)) and not i.startswith('__') and \
not i.startswith('.'):
ans.append(i)
return ans
def traverse_subdirs(tdir):
temp = os.listdir(tdir)
if number_of_unhidden_files(tdir, temp) == 0:
try:
cdir = os.path.join(tdir, unhidden_directories(tdir, temp)[0])
return traverse_subdirs(cdir)
except IndexError:
pass
return tdir
def handle_archive(path):
tdir = tempfile.mkdtemp(prefix=__appname__+'_'+'archive_')
extract(path, tdir)
files = []
cdir = traverse_subdirs(tdir)
file = None
exts = ['lit', 'rtf', 'fb2','pdf', 'txt', 'epub', 'mobi', 'prc']
candidates = map(lambda x:os.path.join(cdir, x), os.listdir(cdir))
for ext in exts:
for f in candidates:
if f.lower().endswith('.'+ext):
files.append(f)
file = largest_file(files)
if not file:
file = find_htmlfile(cdir)
if isinstance(file, str):
file = file.decode(sys.getfilesystemencoding())
return tdir, file
def odt2lrf(path, options, logger):
from calibre.ebooks.odt.to_oeb import Extract
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('odt2lrf')
setup_cli_handlers(logger, level)
with TemporaryDirectory('_odt2lrf') as tdir:
opf = Extract()(path, tdir)
options.use_spine = True
options.encoding = 'utf-8'
html_process_file(opf.replace('metadata.opf', 'index.html'), options, logger)
def process_file(path, options, logger=None):
path = os.path.abspath(os.path.expanduser(path))
path = run_plugins_on_preprocess(path)
tdir = None
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('any2lrf')
setup_cli_handlers(logger, level)
if not os.access(path, os.R_OK):
logger.critical('Cannot read from %s', path)
return 1
ext = os.path.splitext(path)[1]
if not ext or ext == '.':
logger.critical('Unknown file type: %s', path)
return 1
ext = ext[1:].lower()
cwd = os.getcwd()
if not options.output:
fmt = '.lrs' if options.lrs else '.lrf'
options.output = os.path.splitext(os.path.basename(path))[0] + fmt
options.output = os.path.abspath(os.path.expanduser(options.output))
if ext in ['zip', 'rar', 'oebzip']:
newpath = None
try:
tdir, newpath = handle_archive(path)
except:
logger.exception(' ')
if not newpath:
raise UnknownFormatError('Could not find ebook in archive')
path = newpath
logger.info('Found ebook in archive: %s', repr(path))
try:
ext = os.path.splitext(path)[1][1:].lower()
convertor = None
if 'htm' in ext:
convertor = html2lrf
elif 'lit' == ext:
convertor = lit2lrf
elif 'pdf' == ext:
convertor = pdf2lrf
elif 'rtf' == ext:
convertor = rtf2lrf
elif 'txt' == ext:
convertor = txt2lrf
elif 'epub' == ext:
convertor = epub2lrf
elif ext in ['mobi', 'prc', 'azw']:
convertor = mobi2lrf
elif ext == 'fb2':
convertor = fb22lrf
elif ext == 'odt':
convertor = odt2lrf
if not convertor:
raise UnknownFormatError(_('Converting from %s to LRF is not supported.')%ext)
convertor(path, options, logger)
finally:
os.chdir(cwd)
if tdir and os.path.exists(tdir):
shutil.rmtree(tdir)
return 0
def option_parser(gui_mode=False):
return _option_parser(usage=_('''\
any2lrf [options] myfile
Convert any ebook format into LRF. Supported formats are:
LIT, RTF, TXT, HTML, EPUB, MOBI, PRC and PDF. any2lrf will also process a RAR or
ZIP archive, looking for an ebook inside the archive.
'''), gui_mode=gui_mode)
def main(args=sys.argv, logger=None, gui_mode=False):
parser = option_parser(gui_mode)
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print _('No file to convert specified.')
return 1
src = args[1]
if not isinstance(src, unicode):
src = src.decode(sys.getfilesystemencoding())
return process_file(src, options, logger)
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,3 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,75 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, logging
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError, DRMError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata.opf import OPF
from calibre.ebooks.metadata.epub import OCFDirReader
from calibre.utils.zipfile import ZipFile
from calibre import setup_cli_handlers
from calibre.ptempfile import PersistentTemporaryDirectory
def option_parser():
return lrf_option_parser(
_('''Usage: %prog [options] mybook.epub
%prog converts mybook.epub to mybook.lrf''')
)
def generate_html(pathtoepub, logger):
if not os.access(pathtoepub, os.R_OK):
raise ConversionError('Cannot read from ' + pathtoepub)
tdir = PersistentTemporaryDirectory('_epub2lrf')
#os.rmdir(tdir)
try:
ZipFile(pathtoepub).extractall(tdir)
except:
raise ConversionError, '.epub extraction failed'
if os.path.exists(os.path.join(tdir, 'META-INF', 'encryption.xml')):
raise DRMError(os.path.basename(pathtoepub))
return tdir
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('epub2lrf')
setup_cli_handlers(logger, level)
epub = os.path.abspath(os.path.expanduser(path))
tdir = generate_html(epub, logger)
try:
ocf = OCFDirReader(tdir)
htmlfile = ocf.opf.spine[0].path
options.opf = os.path.join(tdir, ocf.container[OPF.MIMETYPE])
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
options.use_spine = True
html_process_file(htmlfile, options, logger=logger)
finally:
try:
shutil.rmtree(tdir)
except:
logger.warning('Failed to delete temporary directory '+tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No epub file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,4 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,59 +0,0 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Convert web feeds to LRF files.
'''
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file
from calibre.web.feeds.main import option_parser as feeds_option_parser
from calibre.web.feeds.main import run_recipe
from calibre.ptempfile import TemporaryDirectory
from calibre import sanitize_file_name, strftime
import sys, os
def option_parser():
parser = feeds_option_parser()
parser.remove_option('--output-dir')
parser.remove_option('--lrf')
parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))
lrf_parser = lrf_option_parser('')
lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))
parser.merge(lrf_parser)
return parser
def main(args=sys.argv, notification=None, handler=None):
parser = option_parser()
opts, args = parser.parse_args(args)
opts.lrf = True
if len(args) != 2 and opts.feeds is None:
parser.print_help()
return 1
recipe_arg = args[1] if len(args) > 1 else None
with TemporaryDirectory('_feeds2lrf') as tdir:
opts.output_dir = tdir
recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)
htmlfile = os.path.join(tdir, 'index.html')
if not os.access(htmlfile, os.R_OK):
raise RuntimeError(_('Fetching of recipe failed: ')+recipe_arg)
lparser = lrf_option_parser('')
ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
parser.merge_options(ropts, opts)
if not opts.output:
ext = '.lrs' if opts.lrs else '.lrf'
fname = recipe.title + strftime(recipe.timefmt)+ext
opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
print 'Generating LRF...'
process_file(htmlfile, opts)
return 0
if __name__ == '__main__':
sys.exit(main())

File diff suppressed because it is too large

View File

@@ -1,3 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,90 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil, glob, logging
from tempfile import mkdtemp
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata.opf import OPFReader
from calibre import __appname__, setup_cli_handlers
def option_parser():
parser = lrf_option_parser(
_('''Usage: %prog [options] mybook.lit
%prog converts mybook.lit to mybook.lrf''')
)
return parser
def generate_html(pathtolit, logger):
if not os.access(pathtolit, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtolit
tdir = mkdtemp(prefix=__appname__+'_'+'lit2oeb_')
lr = LitReader(pathtolit)
print 'Extracting LIT file to', tdir
lr.extract_content(tdir)
return tdir
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('lit2lrf')
setup_cli_handlers(logger, level)
lit = os.path.abspath(os.path.expanduser(path))
tdir = generate_html(lit, logger)
try:
opf = glob.glob(os.path.join(tdir, '*.opf'))
if opf:
path = opf[0]
opf = OPFReader(path)
htmlfile = opf.spine[0].path.replace('&', '%26') #convertlit replaces & with %26
options.opf = path
else:
l = glob.glob(os.path.join(tdir, '*toc*.htm*'))
if not l:
l = glob.glob(os.path.join(tdir, '*top*.htm*'))
if not l:
l = glob.glob(os.path.join(tdir, '*contents*.htm*'))
if not l:
l = glob.glob(os.path.join(tdir, '*.htm*'))
if not l:
l = glob.glob(os.path.join(tdir, '*.txt*')) # Some lit files apparently have .txt files in them
if not l:
raise ConversionError('Conversion of lit to html failed. Cannot find html file.')
maxsize, htmlfile = 0, None
for c in l:
sz = os.path.getsize(c)
if sz > maxsize:
maxsize, htmlfile = sz, c
else:
htmlfile = l[0]
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
options.use_spine = True
html_process_file(htmlfile, options, logger=logger)
finally:
try:
shutil.rmtree(tdir)
except:
logger.warning('Failed to delete temporary directory '+tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No lit file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,63 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''''''
import sys, tempfile, os, logging, shutil
from calibre import setup_cli_handlers, __appname__
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
def generate_html(mobifile, tdir):
mr = MobiReader(mobifile)
mr.extract_content(tdir)
return mr.htmlfile
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('mobi2lrf')
setup_cli_handlers(logger, level)
mobi = os.path.abspath(os.path.expanduser(path))
tdir = tempfile.mkdtemp('mobi2lrf', __appname__)
try:
htmlfile = generate_html(mobi, tdir)
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
options.use_spine = True
html_process_file(htmlfile, options, logger=logger)
finally:
try:
shutil.rmtree(tdir)
except:
logger.warning('Failed to delete temporary directory '+tdir)
def option_parser():
return lrf_option_parser(
_('''Usage: %prog [options] mybook.mobi|prc
%prog converts mybook.mobi to mybook.lrf''')
)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No mobi file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -2,7 +2,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import struct, array, zlib, cStringIO, collections, re
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE, entity_to_unicode
from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE
from calibre import entity_to_unicode
from calibre.ebooks.lrf.tags import Tag
ruby_tags = {

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
class LRFOptions(object):
def __init__(self, output, opts, oeb):
def f2s(f):
try:
return unicode(f[0])
except:
return ''
m = oeb.metadata
self.title = None
self.author = self.publisher = _('Unknown')
self.freetext = f2s(m.description)
self.category = f2s(m.tags)
self.title_sort = self.author_sort = ''
self.cover = None
self.use_metadata_cover = True
self.output = output
self.ignore_tables = opts.linearize_tables
self.base_font_size = 0
self.blank_after_para = opts.insert_blank_line
self.use_spine = True
self.font_delta = 0
self.ignore_colors = False
from calibre.ebooks.lrf import PRS500_PROFILE
self.profile = PRS500_PROFILE
self.link_levels = sys.maxint
self.link_exclude = '@'
self.no_links_in_toc = True
self.disable_chapter_detection = True
self.chapter_regex = 'dsadcdswcdec'
self.chapter_attr = '$,,$'
self.override_css = self._override_css = ''
self.page_break = 'h[12]'
self.force_page_break = '$'
self.force_page_break_attr = '$'
self.add_chapters_to_toc = False
self.baen = self.pdftohtml = self.book_designer = False
self.verbose = opts.verbose
self.encoding = 'utf-8'
self.lrs = False
self.minimize_memory_usage = False
self.autorotation = opts.enable_autorotation
for x in ('top', 'bottom', 'left', 'right'):
setattr(self, x+'_margin', (self.profile.dpi/72.) * getattr(opts,
'margin_'+x))
for x in ('wordspace', 'header', 'header_format',
'header_separation', 'minimum_indent', 'serif_family',
'render_tables_as_images', 'sans_family', 'mono_family',
'text_size_multiplier_for_rendered_tables'):
setattr(self, x, getattr(opts, x))
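# Worked example (illustrative only, not part of this commit): the loop above
# converts the pipeline's pt-based margins into the px values the legacy LRF
# code expects, scaling by the device profile's dpi. Assuming PRS500_PROFILE.dpi
# is 166 (the value is not shown in this diff) and the default margin_top of 5 pt:
#     margin_top_px = 166 / 72.0 * 5.0   # = 11.53, i.e. about 11.5 px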
class LRFOutput(OutputFormatPlugin):
name = 'LRF Output'
author = 'Kovid Goyal'
file_type = 'lrf'
options = set([
OptionRecommendation(name='enable_autorotation', recommended_value=False,
help=_('Enable autorotation of images that are wider than the screen width.')
),
OptionRecommendation(name='wordspace',
recommended_value=2.5, level=OptionRecommendation.LOW,
help=_('Set the space between words in pts. Default is %default')
),
OptionRecommendation(name='header', recommended_value=False,
help=_('Add a header to all the pages with title and author.')
),
OptionRecommendation(name='header_format', recommended_value="%t by %a",
help=_('Set the format of the header. %a is replaced by the author '
'and %t by the title. Default is %default')
),
OptionRecommendation(name='header_separation', recommended_value=0,
help=_('Add extra spacing below the header. Default is %default px.')
),
OptionRecommendation(name='minimum_indent', recommended_value=0,
help=_('Minimum paragraph indent (the indent of the first line '
'of a paragraph) in pts. Default: %default')
),
OptionRecommendation(name='render_tables_as_images',
recommended_value=False,
help=_('Render tables in the HTML as images (useful if the '
'document has large or complex tables)')
),
OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
recommended_value=1,
help=_('Multiply the size of text in rendered tables by this '
'factor. Default is %default')
),
OptionRecommendation(name='serif_family', recommended_value=None,
help=_('The serif family of fonts to embed')
),
OptionRecommendation(name='sans_family', recommended_value=None,
help=_('The sans-serif family of fonts to embed')
),
OptionRecommendation(name='mono_family', recommended_value=None,
help=_('The monospace family of fonts to embed')
),
])
recommendations = set([
('dont_justify', True, OptionRecommendation.HIGH),
])
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
lrf_opts = LRFOptions(output_path, opts, oeb)
from calibre.ptempfile import TemporaryDirectory
with TemporaryDirectory('_lrf_output') as tdir:
from calibre.customize.ui import plugin_for_output_format
oeb_output = plugin_for_output_format('oeb')
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
from calibre.ebooks.lrf.html.convert_from import process_file
process_file(os.path.join(tdir, opf), lrf_opts, self.log)
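Once registered, the new plugin is resolved by file type, just as convert() above resolves the intermediate 'oeb' plugin. A hedged sketch of looking it up and of the entry point the conversion pipeline calls (assuming the registry helper shown above returns the plugin for 'lrf'):

from calibre.customize.ui import plugin_for_output_format

lrf = plugin_for_output_format('lrf')
print lrf.name, lrf.file_type    # expected: LRF Output lrf
# The pipeline drives it through the method defined above:
#   lrf.convert(oeb, output_path, input_plugin, opts, log)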

View File

@@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,131 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''''''
import sys, os, subprocess, logging
import errno
from functools import partial
from calibre import isosx, setup_cli_handlers, filename_to_utf8, iswindows, islinux
from calibre.ebooks import ConversionError, DRMError
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.metadata.pdf import get_metadata
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
if isosx and hasattr(sys, 'frameworks_dir'):
PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'):
PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
if islinux and getattr(sys, 'frozen_path', False):
PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
def generate_html(pathtopdf, tdir):
'''
Convert the pdf into html.
@return: Path to a temporary file containing the HTML.
'''
if isinstance(pathtopdf, unicode):
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
if not os.access(pathtopdf, os.R_OK):
raise ConversionError, 'Cannot read from ' + pathtopdf
index = os.path.join(tdir, 'index.html')
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths
pathtopdf = os.path.abspath(pathtopdf)
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', pathtopdf, os.path.basename(index))
cwd = os.getcwd()
try:
os.chdir(tdir)
try:
p = popen(cmd, stderr=subprocess.PIPE)
except OSError, err:
if err.errno == 2:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
else:
raise
'''
print p.stdout.read()
'''
while True:
try:
ret = p.wait()
break
except OSError, e:
if e.errno == errno.EINTR:
continue
else:
raise
if ret != 0:
err = p.stderr.read()
raise ConversionError, err
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
raw = open(index, 'rb').read()
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
if not '<br' in raw[:4000]:
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
try:
mi = get_metadata(open(pathtopdf, 'rb'))
except:
mi = MetaInformation(None, None)
if not mi.title:
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(tdir, mi)
opf.create_manifest([('index.html', None)])
opf.create_spine(['index.html'])
opf.render(open('metadata.opf', 'wb'))
finally:
os.chdir(cwd)
return index
def option_parser():
return lrf_option_parser(
_('''%prog [options] mybook.pdf
%prog converts mybook.pdf to mybook.lrf''')
)
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('pdf2lrf')
setup_cli_handlers(logger, level)
pdf = os.path.abspath(os.path.expanduser(path))
tdir = PersistentTemporaryDirectory('_pdf2lrf')
htmlfile = generate_html(pdf, tdir)
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
else:
options.output = os.path.abspath(options.output)
options.pdftohtml = True
if not options.title:
options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
html_process_file(htmlfile, options, logger)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No pdf file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,426 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Convert PDF to a reflowable format using pdftoxml.exe as the PDF parsing backend.
'''
import sys, os, re, tempfile, subprocess, atexit, shutil, logging, xml.parsers.expat
from xml.etree.ElementTree import parse
from calibre import isosx, setup_cli_handlers, __appname__
from calibre.utils.config import OptionParser
from calibre.ebooks import ConversionError
PDFTOXML = 'pdftoxml.exe'
if isosx and hasattr(sys, 'frameworks_dir'):
PDFTOXML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOXML)
class StyleContainer(object):
def set_style(self, iterator):
styles = []
for tok in iterator:
if hasattr(tok, 'style') and tok.style not in styles:
styles.append(tok.style)
counts = [0 for i in range(len(styles))]
for i in range(len(styles)):
counts[i] = sum([1 for j in self if j.style == styles[i]])
maxcount = max(counts)
for i in range(len(counts)):
if counts[i] == maxcount:
break
self.style = styles[i]
for obj in iterator:
if obj.style == self.style:
obj.style = None
class Page(object):
def __init__(self, attrs):
for a in ('number', 'width', 'height'):
setattr(self, a, float(attrs[a]))
self.id = attrs['id']
self.current_line = None
self.lines = []
def end_line(self):
if self.current_line is not None:
self.current_line.finalize()
self.lines.append(self.current_line)
self.current_line = None
def finalize(self):
self.identify_groups()
self.look_for_page_break()
def identify_groups(self):
groups = []
in_group = False
for i in range(len(self.lines)):
if not in_group:
groups.append(i)
in_group = True
else:
pl = self.lines[i-1]
cl = self.lines[i]
if cl.left != pl.left and cl.width != pl.width:
groups.append(i)
self.groups = []
for i in range(len(groups)):
start = groups[i]
if i + 1 == len(groups):
stop = len(self.lines)
else:
stop = groups[i+1]
self.groups.append(Group(self.lines[start:stop]))
if len(self.groups) > 1:
self.groups[0].test_header(self.width, self.height)
self.groups[-1].test_footer(self.width, self.height)
def look_for_page_break(self):
max = 0
for g in self.groups:
if not g.is_footer and g.bottom > max:
max = g.bottom
self.page_break_after = max < 0.8*self.height
class Group(StyleContainer):
def __init__(self, lines):
self.lines = lines
self.set_style(self.lines)
self.width = max([i.width for i in self.lines])
self.bottom = max([i.bottom for i in self.lines])
tot, ltot = 0, 0
for i in range(1, len(self.lines)):
bot = self.lines[i-1].bottom
top = self.lines[i].top
tot += abs(top - bot)
ltot += self.lines[i].left
self.average_line_spacing = tot/float(len(self.lines)-1)
ltot += self.lines[0].left
self.average_left_margin = ltot/float(len(self.lines))
self.left_margin = min([i.left for i in self.lines])
self.detect_paragraphs()
def detect_paragraphs(self):
if not self.lines:
return
indent_buffer = 5
self.lines[0].is_para_start = self.lines[0].left > self.average_left_margin+indent_buffer
for i in range(1, len(self.lines)):
pl, l = self.lines[i-1:i+1]
c1 = pl.bottom - l.top > self.average_line_spacing
c2 = l.left > self.average_left_margin+indent_buffer
c3 = pl.width < 0.8 * self.width
l.is_para_start = c1 or c2 or c3
def test_header(self, page_width, page_height):
self.is_header = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width
def test_footer(self, page_width, page_height):
self.is_footer = len(self.lines) == 1 and self.lines[0].width < 0.5*page_width
class Text(object):
def __init__(self, attrs):
for a in ('x', 'y', 'width', 'height'):
setattr(self, a, float(attrs[a]))
self.id = attrs['id']
self.objects = []
def add_token(self, tok):
if not self.objects:
self.objects.append(tok)
else:
ptok = self.objects[-1]
if tok == ptok:
ptok.text += ' ' + tok.text
else:
self.objects.append(tok)
def add(self, object):
if isinstance(object, Token):
self.add_token(object)
else:
print 'WARNING: Unhandled object', object.__class__.__name__
def to_xhtml(self):
res = []
for obj in self.objects:
if isinstance(obj, Token):
res.append(obj.to_xhtml())
return ' '.join(res)
class Line(list, StyleContainer):
def calculate_geometry(self):
self.left = self[0].x
self.width = self[-1].x + self[-1].width - self.left
self.top = min(o.y for o in self)
self.bottom = max(o.height+o.y for o in self)
def finalize(self):
self.calculate_geometry()
self.set_style(self)
def to_xhtml(self, group_id):
ans = '<span class="%s" '%group_id
if self.style is not None:
ans += 'style="%s"'%self.style.to_css(inline=True)
ans += '>%s</span>'
res = []
for object in self:
if isinstance(object, Text):
res.append(object.to_xhtml())
return ans%(' '.join(res))
class TextStyle(object):
def __init__(self, tok):
self.bold = tok.bold
self.italic = tok.italic
self.font_name = tok.font_name
self.font_size = tok.font_size
self.color = tok.font_color
def __eq__(self, other):
if isinstance(other, self.__class__):
for a in ('font_size', 'bold', 'italic', 'font_name', 'color'):
if getattr(self, a) != getattr(other, a):
return False
return True
return False
def to_css(self, inline=False):
fw = 'bold' if self.bold else 'normal'
fs = 'italic' if self.italic else 'normal'
fsz = '%dpt'%self.font_size
props = ['font-weight: %s;'%fw, 'font-style: %s;'%fs, 'font-size: %s;'%fsz,
'color: rgb(%d, %d, %d);'%self.color]
joiner = ' '
if not inline:
joiner = '\n'
props = ['{'] + props + ['}']
return joiner.join(props)
class Token(object):
def __init__(self, attrs):
for a in ('x', 'y', 'width', 'height', 'rotation', 'angle', 'font-size'):
setattr(self, a.replace('-', '_'), float(attrs[a]))
for a in ('bold', 'italic'):
setattr(self, a, attrs[a]=='yes')
self.font_name = attrs['font-name']
fc = re.compile(r'#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})', re.IGNORECASE)
fc = fc.match(attrs['font-color'])
self.font_color = (int(fc.group(1), 16), int(fc.group(2), 16), int(fc.group(3), 16))
self.id = attrs['id']
self.text = u''
self.style = TextStyle(self)
def handle_char_data(self, data):
self.text += data
def __eq__(self, other):
if isinstance(other, self.__class__):
for a in ('rotation', 'angle', 'font_size', 'bold', 'italic', 'font_name', 'font_color'):
if getattr(self, a) != getattr(other, a):
return False
return True
return False
def to_xhtml(self):
if self.style is not None:
ans = u'<span style="%s">%s</span>'%(self.style.to_css(inline=True), self.text)
else:
ans = self.text
return ans
class PDFDocument(object):
SKIPPED_TAGS = ('DOCUMENT', 'METADATA', 'PDFFILENAME', 'PROCESS', 'VERSION',
'COMMENT', 'CREATIONDATE')
def __init__(self, filename):
parser = xml.parsers.expat.ParserCreate('UTF-8')
parser.buffer_text = True
parser.returns_unicode = True
parser.StartElementHandler = self.start_element
parser.EndElementHandler = self.end_element
self.pages = []
self.current_page = None
self.current_token = None
src = open(filename, 'rb').read()
self.parser = parser
parser.Parse(src)
def start_element(self, name, attrs):
if name == 'TOKEN':
self.current_token = Token(attrs)
self.parser.CharacterDataHandler = self.current_token.handle_char_data
elif name == 'TEXT':
text = Text(attrs)
if self.current_page.current_line is None:
self.current_page.current_line = Line()
self.current_page.current_line.append(text)
else:
y, height = self.current_page.current_line[0].y, self.current_page.current_line[0].height
if y == text.y or y+height == text.y + text.height:
self.current_page.current_line.append(text)
else:
self.current_page.end_line()
self.current_page.current_line = Line()
self.current_page.current_line.append(text)
elif name == 'PAGE':
self.current_page = Page(attrs)
elif name.lower() == 'xi:include':
print 'WARNING: Skipping vector image'
elif name in self.SKIPPED_TAGS:
pass
else:
print 'WARNING: Unhandled element', name
def end_element(self, name):
if name == 'TOKEN':
if self.current_token.angle == 0 and self.current_token.rotation == 0:
self.current_page.current_line[-1].add(self.current_token)
self.current_token = None
self.parser.CharacterDataHandler = None
elif name == 'PAGE':
self.current_page.finalize()
self.pages.append(self.current_page)
self.current_page = None
def to_xhtml(self):
header = u'''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/MarkUp/SCHEMA/xhtml11.xsd" >
<head>
<style type="text/css">
%(style)s
</style>
</head>
<body>
%(body)s
</body>
</html>
'''
res = []
para = []
styles = []
for page in self.pages:
res.append(u'<a name="%s" />'%page.id)
for group in page.groups:
if group.is_header or group.is_footer:
continue
if group.style is not None:
styles.append(u'.%s %s\n'%(group.id, group.style.to_css()))
for line in group.lines:
if line.is_para_start:
indent = group.left_margin - line.left
if para:
res.append(u'<p style="text-indent: %dpt">%s</p>'%(indent, ''.join(para)))
para = []
para.append(line.to_xhtml(group.id))
if page.page_break_after:
res.append(u'<br style="page-break-after:always" />')
if para:
res.append(u'<p>%s</p>'%(''.join(para)))
para = []
return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8')
class PDFConverter(object):
@classmethod
def generate_xml(cls, pathtopdf, logger):
pathtopdf = os.path.abspath(pathtopdf)
tdir = tempfile.mkdtemp('pdf2xml', __appname__)
atexit.register(shutil.rmtree, tdir)
xmlfile = os.path.basename(pathtopdf)+'.xml'
os.chdir(tdir)
cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile)
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT,
stdout=subprocess.PIPE)
log = p.stdout.read()
ret = p.wait()
if ret != 0:
raise ConversionError, log
xmlfile = os.path.join(tdir, xmlfile)
if os.stat(xmlfile).st_size < 20:
raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.')
return xmlfile
def __init__(self, pathtopdf, logger, opts):
self.cwd = os.getcwdu()
self.logger = logger
self.opts = opts
try:
self.logger.info('Converting PDF to XML')
self.xmlfile = self.generate_xml(pathtopdf, self.logger)
self.tdir = os.path.dirname(self.xmlfile)
self.data_dir = self.xmlfile + '_data'
outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml'
self.logger.info('Parsing XML')
self.document = PDFDocument(self.xmlfile)
self.outline = parse(outline_file)
finally:
os.chdir(self.cwd)
def convert(self, output_dir):
doc = self.document.to_xhtml()
open(os.path.join(output_dir, 'document.html'), 'wb').write(doc)
def option_parser():
parser = OptionParser(usage=\
'''
%prog [options] myfile.pdf
Convert a PDF file to a HTML file.
''')
parser.add_option('-o', '--output-dir', default='.',
help=_('Path to output directory in which to create the HTML file. Defaults to current directory.'))
parser.add_option('--verbose', default=False, action='store_true',
help=_('Be more verbose.'))
return parser
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args()
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('pdf2html')
setup_cli_handlers(logger, level)
if len(args) != 1:
parser.print_help()
print _('You must specify a single PDF file.')
return 1
options.output_dir = os.path.abspath(options.output_dir)
converter = PDFConverter(os.path.abspath(args[0]), logger, options)
converter.convert(options.output_dir)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,2 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

View File

@@ -1,112 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
Convert .txt files to .lrf
"""
import os, sys, codecs, logging, re, shutil
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks import ConversionError
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre.ebooks.markdown import markdown
from calibre import setup_cli_handlers
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator
def option_parser():
parser = lrf_option_parser(
_('''%prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf'''))
parser.add_option('--debug-html-generation', action='store_true', default=False,
dest='debug_html_generation', help=_('Print generated HTML to stdout and quit.'))
return parser
def fix_image_includes(sdir, tdir, match):
path = match.group(1).split('/')
src = os.path.join(sdir, *path)
dest = os.path.join(tdir, *path)
p = os.path.dirname(dest)
if not os.path.exists(p):
os.makedirs(p)
if not os.path.exists(dest):
shutil.copyfile(src, dest)
def generate_html(txtfile, encoding, tdir):
'''
Convert txtfile to html and return a PersistentTemporaryFile object pointing
to the file with the HTML.
'''
txtfile = os.path.abspath(txtfile)
enc = encoding
if not encoding:
encodings = ['cp1252', 'latin-1', 'utf8', 'iso-8859-1', 'koi8_r', 'koi8_u']
txt, enc = None, None
for encoding in encodings:
try:
txt = codecs.open(txtfile, 'rb', encoding).read()
except UnicodeDecodeError:
continue
enc = encoding
break
if txt == None:
raise ConversionError, 'Could not detect encoding of %s'%(txtfile,)
else:
txt = codecs.open(txtfile, 'rb', enc).read()
print 'Converting text to HTML...'
md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'],
safe_mode=False,
)
html = '<html><body>'+md.convert(txt)+'</body></html>'
for match in re.finditer(r'<img\s+[^>]*src="([^"]+)"', html):
fix_image_includes(os.path.dirname(txtfile), tdir, match)
p = os.path.join(tdir, 'index.html')
open(p, 'wb').write(html.encode('utf-8'))
mi = MetaInformation(os.path.splitext(os.path.basename(txtfile))[0], [_('Unknown')])
opf = OPFCreator(tdir, mi)
opf.create_manifest([(os.path.join(tdir, 'index.html'), None)])
opf.create_spine([os.path.join(tdir, 'index.html')])
opf.render(open(os.path.join(tdir, 'metadata.opf'), 'wb'))
return p
def process_file(path, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('txt2lrf')
setup_cli_handlers(logger, level)
txt = os.path.abspath(os.path.expanduser(path))
if not hasattr(options, 'debug_html_generation'):
options.debug_html_generation = False
tdir = PersistentTemporaryDirectory('_txt2lrf')
htmlfile = generate_html(txt, options.encoding, tdir)
options.encoding = 'utf-8'
if not options.debug_html_generation:
options.force_page_break = 'h2'
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
options.output = os.path.abspath(os.path.expanduser(options.output))
if not options.title:
options.title = os.path.splitext(os.path.basename(path))[0]
html_process_file(htmlfile, options, logger)
else:
print open(htmlfile, 'rb').read()
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print 'No txt file specified'
return 1
process_file(args[1], options, logger)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -1,89 +0,0 @@
Demonstration of `txt2lrf`
==========================
`txt2lrf` provides a convenient way to create LRF files with good formatting.
`txt2lrf` recognizes a simple markup language called *markdown*.
The idea is to provide a lightweight markup that can be used to create
TXT files that can be read by themselves or automatically converted to LRF.
[{@name=toc}]()
<br /><br />
///Table of Contents///
Text formatting
---------------
**Bold** and *italic* text is easily specified.
> Blockquotes are also very simple to specify.
> This is a basic blockquote paragraph. I absolutely
> love block quotes don't you?
This is a preformatted code block. No formatting rules are applied to text in this block and it is rendered in a monospaced font.
For details on the text formatting syntax visit
http://daringfireball.net/projects/markdown/syntax
___
[Table of Contents](#toc)
Lists
-----
Both ordered and unordered lists are supported.
### Unordered lists
+ What a
+ *nice*
+ list
### Ordered lists
1. One
2. Two
3. Three
**Note:** Nested lists are not supported
___
[Table of Contents](#toc)
Tables
------
Simple tables are easily generated
| |* Col 1 *|* Col 2 *|
|* Row 1 *| (1, 1) | (1, 2) |
|* Row 2 *| (2, 1) | (2, 2) |
**Note:** Nested tables are not supported
___
[Table of Contents](#toc)
Images
------
`txt2lrf` also has support for inline images like
![this one](small.jpg) this one.
___
[Table of Contents](#toc)
Automatic TOC Creation
----------------------
By inserting `///Table of Contents///` into the text at some point,
a table of contents is automatically generated, with links that point
to all headings underlined with `-------`.
___
[Table of Contents](#toc)

Binary file not shown.


View File

@ -1,6 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
builtin_profiles = []
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -1,183 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Convert websites into LRF files.'''
import sys, tempfile, shutil, os, logging, imp, inspect, re
from urlparse import urlsplit
from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.lrf.html.convert_from import process_file
from calibre.web.fetch.simple import create_fetcher
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class
from calibre.ebooks.lrf.web import builtin_profiles, available_profiles
def option_parser():
parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
'''%prog downloads a site from the web and converts it '''
'''into a LRF file for use with the SONY Reader. '''
'''website_profile is one of '''+str(available_profiles)+\
'''. If you specify a website_profile of default or do not specify '''
'''it, you must specify the --url option.'''
)
parser.add_option('-u', '--url', dest='url', default=None,
help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
parser.add_option('--user-profile', default=None,
help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__)
parser.add_option('--username', dest='username', default=None,
help='Specify the username to be used while downloading. Only used if the profile supports it.')
parser.add_option('--password', dest='password', default=None,
help='Specify the password to be used while downloading. Only used if the profile supports it.')
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
default=None, type='int', dest='timeout')
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse, i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
default=None, type='int', dest='max_recursions')
parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
parser.add_option('--delay', default=None, dest='delay', type='int',
help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
help='Do not download CSS stylesheets.', dest='no_stylesheets')
parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
parser.add_option('--keep-downloaded-files', default=False, action='store_true',
help='''Do not delete the downloaded files after creating the LRF''')
return parser
def fetch_website(options, logger):
tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
options.dir = tdir
fetcher = create_fetcher(options, logger)
fetcher.preprocess_regexps = options.preprocess_regexps
return fetcher.start_fetch(options.url), tdir
def create_lrf(htmlfile, options, logger):
if not options.author or options.author.lower() == 'unknown':
options.author = __appname__
options.header = True
if options.output:
options.output = os.path.abspath(os.path.expanduser(options.output))
else:
options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
process_file(htmlfile, options, logger)
def process_profile(args, options, logger=None):
tdir = None
try:
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2lrf')
setup_cli_handlers(logger, level)
index = -1
if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]):
profile = create_class(args[1])
else:
if options.user_profile is not None:
path = os.path.abspath(options.user_profile)
name = os.path.splitext(os.path.basename(path))[0]
res = imp.find_module(name, [os.path.dirname(path)])
module = imp.load_module(name, *res)
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
and x is not DefaultProfile and x is not FullContentProfile)
if not classes:
raise CommandLineError('Invalid user profile '+path)
builtin_profiles.append(classes[0][1])
available_profiles.append(name)
if len(args) < 2:
args.append(name)
args[1] = name
index = -1
if len(args) == 2:
try:
if isinstance(args[1], basestring):
if args[1] != 'default':
index = available_profiles.index(args[1])
except ValueError:
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
else:
raise CommandLineError('Only one profile at a time is allowed.')
profile = DefaultProfile if index == -1 else builtin_profiles[index]
profile = profile(logger, options.verbose, options.username, options.password)
if profile.browser is not None:
options.browser = profile.browser
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
val = getattr(options, opt)
if val is None:
setattr(options, opt, getattr(profile, opt))
if not options.url:
options.url = profile.url
if not options.url:
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
if not options.title:
title = profile.title
if not title:
title = urlsplit(options.url).netloc
options.title = title + strftime(profile.timefmt)
options.match_regexps += profile.match_regexps
options.preprocess_regexps = profile.preprocess_regexps
options.filter_regexps += profile.filter_regexps
options.encoding = profile.encoding if options.encoding is None else options.encoding
if len(args) == 2 and args[1] != 'default':
options.anchor_ids = False
htmlfile, tdir = fetch_website(options, logger)
options.encoding = 'utf-8'
cwd = os.getcwd()
if not options.output:
title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title
options.output = os.path.join(cwd, title+('.lrs' if options.lrs else '.lrf'))
if not os.path.isabs(options.output):
options.output = os.path.join(cwd, options.output)
option_parser().parse_args(profile.html2lrf_options, options)
try:
os.chdir(os.path.dirname(htmlfile))
create_lrf(os.path.basename(htmlfile), options, logger)
finally:
os.chdir(cwd)
finally:
try:
profile.cleanup()
except:
pass
if tdir and os.path.isdir(tdir):
if options.keep_downloaded_files:
print 'Downloaded files in ', tdir
else:
shutil.rmtree(tdir)
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) > 2 or (len(args) == 1 and not options.user_profile):
parser.print_help()
return 1
try:
process_profile(args, options, logger=logger)
except CommandLineError, err:
print >>sys.stderr, err
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -1,572 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the Base Profiles that can be used to easily create profiles to download
particular websites.
'''
import tempfile, time, calendar, re, operator, atexit, shutil, os
from htmlentitydefs import name2codepoint
from email.utils import formatdate
from calibre import __appname__, iswindows, browser, strftime
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
class DefaultProfile(object):
#: The title to use for the LRF file
#: @type: string
title = 'Default Profile'
#: Maximum number of articles to download from each feed
#: @type: integer
max_articles_per_feed = 10
#: If True process the <description> element of the feed as HTML
#: @type: boolean
html_description = True
#: How many days old should the oldest article downloaded from the feeds be
#: @type: integer
oldest_article = 7
#: Recommend frequency at which to download this profile. In days.
recommended_frequency = 7
#: Number of levels of links to follow
#: @type: integer
max_recursions = 1
#: Maximum number of files to download
#: @type: integer
max_files = 3000
#: Delay between consecutive downloads in seconds
#: @type: integer
delay = 0
#: Timeout for fetching files from server in seconds
#: @type: integer
timeout = 10
#: The format string for the date shown on the first page
#: @type: string
timefmt = ' [%a %d %b %Y]'
#: The order of elements to search for a URL when parsing the RSS feed. You
#: can replace these elements by completely arbitrary elements to customize
#: feed processing.
#: @type: list of strings
url_search_order = ['guid', 'link']
#: The format string used to parse the publication date in the RSS feed.
#: If set to None some default heuristics are used, these may fail,
#: in which case set this to the correct string or re-implement
#: L{DefaultProfile.strptime} in your subclass.
#: @type: string or None
pubdate_fmt = None
#: If True will look for a publication date for each article.
#: If False assumes the publication date is the current time.
#: @type: boolean
use_pubdate = True
#: Max number of characters in the short description.
#: Used by L{FullContentProfile}
#: @type: integer
summary_length = 500
#: If True stylesheets are not downloaded and processed
#: Convenient flag to disable loading of stylesheets for websites
#: that have overly complex stylesheets unsuitable for conversion
#: to ebooks formats
#: @type: boolean
no_stylesheets = False
#: If False articles with the same title in the same feed
#: are not downloaded multiple times
#: @type: boolean
allow_duplicates = False
#: If True the GUI will ask the user for a username and password
#: to use while downloading
#: @type: boolean
needs_subscription = False
#: Specify an override encoding for sites that have an incorrect
#: charset specification. The most common case is a site that declares
#: latin1 but actually uses cp1252
encoding = None
#: List of regular expressions that determines which links to follow
#: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
#: @type: list of strings
match_regexps = []
#: List of regular expressions that determines which links to ignore
#: If empty it is ignored
#: Only one of L{match_regexps} or L{filter_regexps} should be defined
#: @type: list of strings
filter_regexps = []
#: List of options to pass to html2lrf, to customize conversion
#: to LRF
#: @type: list of strings
html2lrf_options = []
#: List of regexp substitution rules to run on the downloaded HTML. Each element of the
#: list should be a two element tuple. The first element of the tuple should
#: be a compiled regular expression and the second a callable that takes
#: a single match object and returns a string to replace the match.
#: @type: list of tuples
preprocess_regexps = []
# See the built-in profiles for examples of these settings.
#: The URL of the website
#: @type: string
url = ''
feeds = []
CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
must be a 2-element tuple of the form (title, url).
'''
if not self.feeds:
raise NotImplementedError
return self.feeds
@classmethod
def print_version(cls, url):
'''
Take a URL pointing to an article and return the URL pointing to the
print version of the article.
'''
return url
@classmethod
def get_browser(cls):
'''
Return a browser instance used to fetch documents from the web.
If your profile requires that you login first, override this method
in your subclass. See for example the nytimes profile.
'''
return browser()
def __init__(self, logger, verbose=False, username=None, password=None, lrf=True):
self.logger = logger
self.username = username
self.password = password
self.verbose = verbose
self.lrf = lrf
self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
self.browser = self.get_browser()
try:
self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
except NotImplementedError:
self.url = None
atexit.register(cleanup, self.temp_dir)
def build_index(self):
'''Build an RSS based index.html'''
articles = self.parse_feeds()
encoding = 'utf-8' if self.encoding is None else self.encoding
def build_sub_index(title, items):
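# Render a single category page: each article becomes a linked list item
# showing its date and a short description.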
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
if not item.has_key('date'):
item['date'] = time.strftime('%a, %d %b', time.localtime())
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode(encoding))
title = self.title
if not isinstance(title, unicode):
title = unicode(title, 'utf-8', 'replace')
src = u'''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=strftime('%a, %d %B, %Y'),
categories=clist, title=title)
index = os.path.join(self.temp_dir, 'index.html')
open(index, 'wb').write(src.encode(encoding))
return index
@classmethod
def tag_to_string(cls, tag, use_alt=True):
'''
Convenience method to take a BeautifulSoup Tag and extract the text from it
recursively, including any CDATA sections and alt tag attributes.
@param use_alt: If True try to use the alt attribute for tags that don't have any textual content
@type use_alt: boolean
@return: A unicode (possibly empty) object
@rtype: unicode string
'''
if not tag:
return ''
if isinstance(tag, basestring):
return tag
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):
strings.append(item.string)
elif isinstance(item, Tag):
res = cls.tag_to_string(item)
if res:
strings.append(res)
elif use_alt and item.has_key('alt'):
strings.append(item['alt'])
return u''.join(strings)
def get_article_url(self, item):
'''
Return the article URL given an item Tag from a feed, or None if no valid URL is found
@type item: BeautifulSoup.Tag
@param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
@rtype: string or None
'''
url = None
for element in self.url_search_order:
url = item.find(element.lower())
if url:
break
return url
def parse_feeds(self, require_url=True):
'''
Create list of articles from a list of feeds.
@param require_url: If True skip articles that don't have a link to an HTML page with the full article contents.
@type require_url: boolean
@rtype: dictionary
@return: A dictionary whose keys are feed titles and whose values are each
a list of dictionaries. Each list contains dictionaries of the form::
{
'title' : article title,
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article
'content' : The full article (can be an empty string). This is used by FullContentProfile
}
'''
added_articles = {}
feeds = self.get_feeds()
articles = {}
for title, url in feeds:
try:
src = self.browser.open(url).read()
except Exception, err:
self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err))
if self.verbose:
self.logger.exception(' ')
continue
articles[title] = []
added_articles[title] = []
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
atitle = item.find('title')
if not atitle:
continue
atitle = self.tag_to_string(atitle)
if self.use_pubdate:
pubdate = item.find('pubdate')
if not pubdate:
pubdate = item.find('dc:date')
if not pubdate or not pubdate.string:
pubdate = formatdate()
pubdate = self.tag_to_string(pubdate)
pubdate = pubdate.replace('+0000', 'GMT')
url = self.get_article_url(item)
url = self.tag_to_string(url)
if require_url and not url:
self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
continue
purl = url
try:
purl = self.print_version(url)
except Exception, err:
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
continue
content = item.find('content:encoded')
if not content:
content = item.find('description')
if content:
content = self.process_html_description(content, strip_links=False)
else:
content = ''
d = {
'title' : atitle,
'url' : purl,
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
'date' : pubdate if self.use_pubdate else formatdate(),
'content' : content,
}
delta = time.time() - d['timestamp']
if not self.allow_duplicates:
if d['title'] in added_articles[title]:
continue
added_articles[title].append(d['title'])
if delta > self.oldest_article*3600*24:
continue
except Exception, err:
if self.verbose:
self.logger.exception('Error parsing article:\n%s'%(item,))
continue
try:
desc = ''
for c in item.findAll('description'):
desc = self.tag_to_string(c)
if desc:
break
d['description'] = self.process_html_description(desc) if self.html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
articles[title] = articles[title][:self.max_articles_per_feed+1]
#for item in articles[title]:
# item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles
def cleanup(self):
'''
Called after the LRF file has been generated. Use it to do any cleanup, like
logging out of subscription sites, etc.
'''
pass
@classmethod
def process_html_description(cls, tag, strip_links=True):
'''
Process a <description> tag that contains HTML markup, either
entity encoded or escaped in a CDATA section.
@return: HTML
@rtype: string
'''
src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
match = cls.CDATA_PAT.match(src.lstrip())
if match:
src = match.group(1)
else:
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
if strip_links:
src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
return src
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
FULL_DAY_MAP = dict(Sunday=0, Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
July=7, August=8, September=9, October=10,
November=11, December=12)
@classmethod
def strptime(cls, src):
'''
Take a string and return the date that string represents, in UTC as
an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
a bunch of heuristics and is a prime candidate for being overridden in a
subclass.
@param src: Timestamp as a string
@type src: string
@return: time as an epoch
@rtype: number
'''
delta = 0
zone = re.search(r'\s*(\+\d\d\:{0,1}\d\d)', src)
if zone:
delta = zone.group(1)
hrs, mins = int(delta[1:3]), int(delta[-2:].rstrip())
delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
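# e.g. a zone of '+05:30' gives delta = 60*(5*60 + 30) = 19800 seconds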
src = src.replace(zone.group(), '')
if cls.pubdate_fmt is None:
src = src.strip().split()
try:
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
except KeyError:
src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+','
try:
src[2] = str(cls.MONTH_MAP[src[2]])
except KeyError:
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
fmt = '%w, %d %m %Y %H:%M:%S'
src = src[:5] # Discard extra information
try:
time_t = time.strptime(' '.join(src), fmt)
except ValueError:
time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
return calendar.timegm(time_t)-delta
else:
return calendar.timegm(time.strptime(src, cls.pubdate_fmt))
def command_line_options(self):
args = []
args.append('--max-recursions='+str(self.max_recursions))
args.append('--delay='+str(self.delay))
args.append('--max-files='+str(self.max_files))
for i in self.match_regexps:
args.append('--match-regexp="'+i+'"')
for i in self.filter_regexps:
args.append('--filter-regexp="'+i+'"')
return args
class FullContentProfile(DefaultProfile):
'''
This profile is designed for feeds that embed the full article content in the RSS file.
'''
max_recursions = 0
article_counter = 0
def build_index(self):
'''Build an RSS based index.html. '''
articles = self.parse_feeds(require_url=False)
def build_sub_index(title, items):
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
content = item['content']
if not content:
self.logger.debug('Skipping article as it has no content:%s'%item['title'])
continue
item['description'] = cutoff(item['description'], self.summary_length)+'&hellip;'
self.article_counter = self.article_counter + 1
url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
item['url'] = url
open(url, 'wb').write((u'''\
<html>
<body>
<h2>%s</h2>
<div>
%s
</div>
</body>
</html>'''%(item['title'], content)).encode('utf-8')
)
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode('utf-8'))
src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
categories=clist, title=self.title)
index = os.path.join(self.temp_dir, 'index.html')
open(index, 'wb').write(src.encode('utf-8'))
return index
def cutoff(src, pos, fuzz=50):
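# Truncate src near pos, extending the cut to the next ';' or '>' (if one
# occurs within fuzz characters) so an entity or tag is not chopped in half.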
si = src.find(';', pos)
if si > 0 and si-pos > fuzz:
si = -1
gi = src.find('>', pos)
if gi > 0 and gi-pos > fuzz:
gi = -1
npos = max(si, gi)
if npos < 0:
npos = pos
return src[:npos+1]
def create_class(src):
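# Execute the user supplied profile source and return the first class it
# defines that implements build_index (skipping the two base classes).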
environment = {'FullContentProfile':FullContentProfile, 'DefaultProfile':DefaultProfile}
exec src in environment
for item in environment.values():
if hasattr(item, 'build_index'):
if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
return item
def cleanup(tdir):
try:
if os.path.isdir(tdir):
shutil.rmtree(tdir)
except:
pass
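To illustrate how the pieces above fit together, here is a minimal sketch of a user profile, in the same style as the built-in profiles that follow; the site name, feed URL and print-version rule are made up for illustration:
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile

class ExampleNews(DefaultProfile):
    title = 'Example News'              # hypothetical site
    max_recursions = 2
    max_articles_per_feed = 20
    timefmt = ' [%d %b %Y]'

    # (compiled regexp, replacement callable) pairs, as documented above
    preprocess_regexps = [
        (re.compile(r'<script.*?>.*?</script>', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    def get_feeds(self):
        # Each feed is a (title, url) tuple
        return [('Front Page', 'http://example.com/rss.xml')]

    def print_version(self, url):
        # Point each article at its printer friendly page (hypothetical rule)
        return url + '?print=1'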

View File

@ -1,38 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class AssociatedPress(DefaultProfile):
title = 'Associated Press'
max_recursions = 2
max_articles_per_feed = 15
html2lrf_options = ['--force-page-break-before-tag="chapter"']
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<body class="apple-rss-no-unread-mode" onLoad="setup(null)">.*?<!-- start Entries -->', lambda match : '<body>'),
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<body.*?>.*?<span class="headline">', lambda match : '<body><span class="headline"><chapter>'),
(r'<tr><td><div class="body">.*?<p class="ap-story-p">', lambda match : '<p class="ap-story-p">'),
(r'<p class="ap-story-p">', lambda match : '<p>'),
(r'Learn more about our <a href="http://apdigitalnews.com/privacy.html">Privacy Policy</a>.*?</body>', lambda match : '</body>'),
]
]
def get_feeds(self):
return [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
]

View File

@ -1,47 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Atlantic(DefaultProfile):
title = 'The Atlantic'
max_recursions = 2
INDEX = 'http://www.theatlantic.com/doc/current'
preprocess_regexps = [
(re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
lambda m: '<body><div id="storytop"')
]
def parse_feeds(self):
articles = []
src = self.browser.open(self.INDEX).read()
soup = BeautifulSoup(src)
issue = soup.find('span', attrs={'class':'issue'})
if issue:
self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
for item in soup.findAll('div', attrs={'class':'item'}):
a = item.find('a')
if a and a.has_key('href'):
url = a['href']
url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
title = self.tag_to_string(a)
byline = item.find(attrs={'class':'byline'})
date = self.tag_to_string(byline) if byline else ''
description = ''
articles.append({
'title':title,
'date':date,
'url':url,
'description':description
})
return {'Current Issue' : articles }

View File

@ -1,75 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import iswindows
from calibre.ebooks.chardet import xml_to_unicode
class AutomaticRSSProfile(DefaultProfile):
'''
Make downloading of RSS feeds completely automatic. Only input
required is the URL of the feed.
'''
max_recursions = 2
def __init__(self, *args, **kwargs):
self.cindex = 1
DefaultProfile.__init__(self, *args, **kwargs)
def fetch_content(self, index):
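# Follow every link in the generated index, download the target page, keep
# only its heading and paragraph tags (plus any <style> from the <head>),
# write the result to a local contentN.html and re-point the link at it.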
raw = open(index, 'rb').read()
if self.encoding:
raw = raw.decode(self.encoding)
enc = self.encoding
else:
raw, enc = xml_to_unicode(raw)
isoup = BeautifulSoup(raw)
for a in isoup.findAll('a', href=True):
src = a['href']
if src.startswith('file:'):
src = src[5:]
if os.access(src, os.R_OK):
self.fetch_content(src)
continue
try:
src = self.browser.open(src).read()
except:
continue
soup = BeautifulSoup(src)
header, content = [], []
head = soup.find('head')
if head is not None:
for style in head('style'):
header.append(unicode(style))
body = soup.find('body')
if body is None:
continue
for tag in body(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
in_table = False
c = tag.parent
while c is not None:
if c.name == 'table':
in_table = True
break
c = c.parent
if in_table:
continue
content.append(unicode(tag))
cfile = 'content%d.html'%self.cindex
self.cindex += 1
cfile = os.path.join(os.path.dirname(index), cfile)
html = '<html>\n<head>%s</head>\n<body>%s</body></html>'%('\n'.join(header), '\n'.join(content))
open(cfile, 'wb').write(html.encode(enc))
a['href'] = ('file:' if iswindows else '') + cfile
open(index, 'wb').write(unicode(isoup).encode(enc))
def build_index(self):
index = DefaultProfile.build_index(self)
self.fetch_content(index)
return index

View File

@ -1,90 +0,0 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class Barrons(DefaultProfile):
title = 'Barron\'s'
max_recursions = 3
max_articles_per_feed = 50
needs_subscription = True
timefmt = ' [%a, %b %d, %Y]'
html_description = True
no_stylesheets = False
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
html2lrf_options = ['--ignore-tables', '--base-font-size=10']
##delay = 1
## Don't grab articles more than 7 days old
oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove any reprint info from the body of the article.
(r'<hr size.*?<p', lambda match : '<p'),
## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'),
]
]
def get_browser(self):
br = DefaultProfile.get_browser()
if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login')
br.select_form(name='login_form')
br['user'] = self.username
br['password'] = self.password
br.submit()
return br
## Use the print version of a page when available.
def print_version(self, url):
return url.replace('/article/', '/article_print/')
## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
def get_feeds(self):
return [
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
]
## Logout of website
## NOT CURRENTLY WORKING
# def cleanup(self):
# try:
# self.browser.set_debug_responses(True)
# import sys, logging
# logger = logging.getLogger("mechanize")
# logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.INFO)
# res = self.browser.open('http://online.barrons.com/logout')
# except:
# import traceback
# traceback.print_exc()

View File

@ -1,45 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch the BBC.
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class BBC(DefaultProfile):
title = 'The BBC'
max_recursions = 2
timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove footer from individual stories
(r'<div class=.footer.>.*?Published',
lambda match : '<p></p><div class="footer">Published'),
# Add some style info in place of disabled stylesheet
(r'<link.*?type=.text/css.*?>', lambda match :
'''<style type="text/css">
.headline {font-size: x-large;}
.fact { padding-top: 10pt }
</style>'''),
]
]
def print_version(self, url):
return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
def get_feeds(self):
src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
ul = soup.find('ul', attrs={'class':'rss'})
for link in ul.findAll('a'):
feeds.append((link.string, link['href']))
return feeds

View File

@ -1,46 +0,0 @@
import re, time
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class ChristianScienceMonitor(DefaultProfile):
title = 'Christian Science Monitor'
max_recursions = 2
max_articles_per_feed = 20
no_stylesheets = True
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
(r'<div class="pubdate">.*?</div>', lambda m: ''),
(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
lambda match : '</body>'),
]]
def parse_feeds(self):
soup = BeautifulSoup(self.browser.open('http://www.csmonitor.com/textedition'))
articles = {}
feed = []
for tag in soup.findAll(['h2', 'p']):
if tag.name == 'h2':
title = self.tag_to_string(tag)
feed = []
articles[title] = feed
elif tag.has_key('class') and tag['class'] == 'story':
a = tag.find('a')
if a is not None and a.has_key('href'):
feed.append({
'title': self.tag_to_string(a),
'url' : 'http://www.csmonitor.com'+a['href'],
'date' : time.strftime('%d %b'),
'content' : '',
})
a.extract()
feed[-1]['description'] = self.tag_to_string(tag).strip()
return articles

View File

@ -1,51 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download CNN
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class CNN(DefaultProfile):
title = 'CNN'
max_recursions = 2
timefmt = ' [%d %b %Y]'
html_description = True
no_stylesheets = True
oldest_article = 15
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
(r'<head>.*?<title', lambda match : '<head><title'),
(r'</title>.*?</head>', lambda match : '</title></head>'),
(r'<body.*?<\!\-\-Article.*?>', lambda match : ''),
(r'<\!\-\-Article End\-\->.*?</body>', lambda match : '</body>'),
(r'(</h\d>)<ul>.*?</ul>', lambda match : match.group(1)), # drop story highlights
(r'<h2>(.*?)</h2><h1>(.*?)</h1>', lambda match : '<h1>' + match.group(1) + '</h1><h2>' + match.group(2) + '</h2>'), # sports uses h2 for main title and h1 for subtitle (???) switch these around
(r'<span class="cnnEmbeddedMosLnk">.*?</span>', lambda match : ''), # drop 'watch more' links
(r'(<div class="cnnstorybody">).*?(<p)', lambda match : match.group(1) + match.group(2)), # drop sports photos
(r'</?table.*?>|</?tr.*?>|</?td.*?>', lambda match : ''), # drop table formatting
(r'<div class="cnnendofstorycontent".*?>.*?</div>', lambda match : ''), # drop extra business links
(r'<a href="#TOP">.*?</a>', lambda match : '') # drop business 'to top' link
] ]
def print_version(self, url):
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
def get_feeds(self):
return [
('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),
('World', 'http://rss.cnn.com/rss/cnn_world.rss'),
('U.S.', 'http://rss.cnn.com/rss/cnn_us.rss'),
('Sports', 'http://rss.cnn.com/rss/si_topstories.rss'),
('Business', 'http://rss.cnn.com/rss/money_latest.rss'),
('Politics', 'http://rss.cnn.com/rss/cnn_allpolitics.rss'),
('Law', 'http://rss.cnn.com/rss/cnn_law.rss'),
('Technology', 'http://rss.cnn.com/rss/cnn_tech.rss'),
('Science & Space', 'http://rss.cnn.com/rss/cnn_space.rss'),
('Health', 'http://rss.cnn.com/rss/cnn_health.rss'),
('Entertainment', 'http://rss.cnn.com/rss/cnn_showbiz.rss'),
('Education', 'http://rss.cnn.com/rss/cnn_education.rss'),
('Offbeat', 'http://rss.cnn.com/rss/cnn_offbeat.rss'),
('Most Popular', 'http://rss.cnn.com/rss/cnn_mostpopular.rss')
]

View File

@ -1,73 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch The Economist.
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Economist(DefaultProfile):
title = 'The Economist'
timefmt = ' [%d %b %Y]'
max_recursions = 2
TITLES = [
'The world this week',
'Letters',
'Briefings',
'Special reports',
'Britain',
'Europe',
'United States',
'The Americas',
'Middle East and Africa',
'Asia',
'International',
'Business',
'Finance and economics',
'Science and technology',
'Books and arts',
'Indicators'
]
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove advert
(r'<noscript.*?</noscript>', lambda match: ''),
(r'<\!--\s+INVISIBLE SKIP .*?-->.*?<\!--\s+INVISIBLE SKIP .*?\s+-->',
lambda match : ''),
(r'<img.+?alt="AP".+?/>', lambda match: ''),
]
]
def __init__(self, logger, verbose=False, username=None, password=None):
DefaultProfile.__init__(self, logger, verbose=verbose, username=username, password=password)
self.browser = None # Needed as otherwise there are timeouts while fetching actual articles
def print_version(self, url):
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
def get_feeds(self):
src = self.browser.open('http://economist.com/rss/').read()
soup = BeautifulSoup(src)
feeds = []
for ul in soup.findAll('ul'):
lis = ul.findAll('li')
try:
title, link = lis[0], lis[1]
except IndexError:
continue
title = title.string
if title:
title = title.strip()
if title not in self.__class__.TITLES:
continue
a = link.find('a')
feeds.append((title, a['href'].strip()))
return feeds

View File

@ -1,28 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download FAZ.net
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class FazNet(DefaultProfile):
title = 'FAZ NET'
max_recursions = 2
html_description = True
max_articles_per_feed = 30
preprocess_regexps = [
(re.compile(r'Zum Thema</span>.*?</BODY>', re.IGNORECASE | re.DOTALL),
lambda match : ''),
]
def get_feeds(self):
return [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
def print_version(self, url):
return url.replace('.html?rss_aktuell', '~Afor~Eprint.html')

View File

@ -1,36 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class JerusalemPost(DefaultProfile):
title = 'Jerusalem Post'
max_recursions = 2
max_articles_per_feed = 10
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<BODY.*?>.*?<!-- start Entries -->', lambda match : '<BODY><!-- start Entries -->'),
(r'<!-- end Entries -->.*?</BODY>', lambda match : '</BODY>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<div class="apple-rss-article apple-rss-read" onclick=.*?<div class="apple-rss-article-body">', lambda match : ''),
(r'<img src=\'/images/logo_NWAnews.gif\' alt=\'NWAnews.com :: Northwest Arkansas\' News Source\'.*?>', lambda match : ''),
(r'<img src=\'/images/logo_adg.gif\'.*?>', lambda match : ''),
(r'<P CLASS="smallprint">.*?</body>', lambda match : '</body>'),
]
]
def get_feeds(self):
return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'),
('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'),
('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'),
('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'),
('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'),
]
def print_version(self, url):
return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter')

View File

@ -1,44 +0,0 @@
'''
Profile to download Jutarnji.hr by Valloric
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class Jutarnji(DefaultProfile):
title = 'Jutarnji'
max_recursions = 2
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 80
html_description = True
no_stylesheets = True
preprocess_regexps = [
(re.compile(r'<body.*?<span class="vijestnaslov">', re.IGNORECASE | re.DOTALL), lambda match : '<body><span class="vijestnaslov">'),
(re.compile(r'</div>.*?</td>', re.IGNORECASE | re.DOTALL), lambda match : '</div></td>'),
(re.compile(r'<a name="addComment.*?</body>', re.IGNORECASE | re.DOTALL), lambda match : '</body>'),
(re.compile(r'<br>', re.IGNORECASE | re.DOTALL), lambda match : ''),
]
## Getting the print version
def print_version(self, url):
return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + url[len(url)-9:len(url)-3]
## Comment out the feeds you don't want retrieved.
## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF
## If you want one of these at the top, append a space in front of the name.
def get_feeds(self):
return [
(' Naslovnica', 'http://www.jutarnji.hr/rss'),
('Sport', 'http://www.jutarnji.hr/sport/rss'),
('Novac', 'http://www.jutarnji.hr/novac/rss'),
('Kultura i zivot', 'http://www.jutarnji.hr/kultura_i_zivot/rss'),
('Automoto', 'http://www.jutarnji.hr/auto_moto/rss'),
('Hi-Tech', 'http://www.jutarnji.hr/kultura_i_zivot/hi-tech/rss'),
('Dom i nekretnine', 'http://www.jutarnji.hr/nekretnine/rss'),
]

View File

@ -1,91 +0,0 @@
## Copyright (C) 2008 B.Scott Wxby [bswxby] &
## Copyright (C) 2007 David Chen SonyReader<at>DaveChen<dot>org
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## Version 0.3-2008_2_28
## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
## https://calibre.kovidgoyal.net/wiki/UserProfiles
##
## Usage:
## >web2lrf --user-profile nasa.py
## Comment out the RSS feeds you don't want in the last section below
##
## Output:
## NASA [YearMonthDate Time].lrf
##
'''
Custom User Profile to download RSS News Feeds and Articles from NASA.gov
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class NASA(DefaultProfile):
title = 'NASA'
max_recursions = 2
timefmt = ' [%Y%b%d %H%M]'
html_description = True
no_stylesheets = True
## Don't grab articles more than 30 days old
oldest_article = 30
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Fix the encoding to UTF-8
(r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),
## Remove any banners/links/ads/cruft before the body of the article.
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
## Remove any links/ads/comments/cruft from the end of the body of the article.
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">&copy;)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
(r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1),),
## This removes header and footer information from each print version.
(r'<!-- Top Header starts -->.*?<!-- Body starts -->', lambda match : '<New Stuff>'),
(r'<hr align="center" width="200"><p align="center">.*?<!-- Press Release standard text ends -->', lambda match : '<New Stuff>'),
(r'<!-- Top Header starts -->.*?<!---->', lambda match : '<New Stuff>'),
## This removes the "download image" of various sizes from the Image of the day.
(r'<div id="download_image_box_print">.*?<div id="caption_region_print">', lambda match : '<New Stuff>'),
]
]
## NASA's print pages differ only by the ending "_prt.htm", so I've replaced them below.
def print_version(self, url):
return url.replace('.html', '_prt.htm')
## Comment out the feeds you don't want retrieved.
## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF
## If you want one of these at the top, append a space in front of the name.
def get_feeds(self):
return [
(' Breaking News', 'http://www.nasa.gov/rss/breaking_news.rss'),
('Image of the Day', 'http://www.nasa.gov/rss/image_of_the_day.rss'),
('Moon and Mars Exploration', 'http://www.nasa.gov/rss/moon_mars.rss'),
('Shuttle and Station News', 'http://www.nasa.gov/rss/shuttle_station.rss'),
('Solar System News', 'http://www.nasa.gov/rss/solar_system.rss'),
('Universe News', 'http://www.nasa.gov/rss/universe.rss'),
('Earth News', 'http://www.nasa.gov/rss/earth.rss'),
('Aeronautics News', 'http://www.nasa.gov/rss/aeronautics.rss'),
]

View File

@ -1,37 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download Newsweek
'''
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class Newsweek(DefaultProfile):
title = 'Newsweek'
max_recursions = 2
timefmt = ' [%d %b %Y]'
html_description = True
oldest_article = 15
def print_version(self, url):
if not url.endswith('/'):
url += '/'
return url + 'output/print'
def get_feeds(self):
return [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
]

View File

@ -1,56 +0,0 @@
'''
Profile to download The New Yorker
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class NewYorker(DefaultProfile):
title = 'The New Yorker'
max_recursions = 2
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 20
html_description = True
no_stylesheets = True
oldest_article = 14
## Getting the print version
def print_version(self, url):
return url + '?printable=true'
preprocess_regexps = [
(re.compile(r'<body.*?<!-- start article content -->', re.IGNORECASE | re.DOTALL), lambda match : '<body>'),
(re.compile(r'<div class="utils"'),
lambda match : '<div class="utils" style="display:none"'),
(re.compile(r'<div class="articleRailLinks"'),
lambda match : '<div class="articleRailLinks" style="display:none"'),
(re.compile(r'<div id="keywords"'),
lambda match : '<div id="keywords" style="display:none"'),
(re.compile(r'<!-- end article body -->.*?</body>', re.IGNORECASE | re.DOTALL), lambda match : '</body>'),
(re.compile(r'<!-- start video content -->.*?<!-- end video content -->', re.IGNORECASE | re.DOTALL), lambda match : '<!-- start video content --><!-- end video content -->'),
]
## Comment out the feeds you don't want retrieved.
## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF
## If you want one of these at the top, append a space in front of the name.
def get_feeds(self):
return [
('Online Only', 'http://feeds.newyorker.com/services/rss/feeds/online.xml'),
('The Talk Of The Town', 'http://feeds.newyorker.com/services/rss/feeds/talk.xml'),
('Reporting and Essays', 'http://feeds.newyorker.com/services/rss/feeds/reporting.xml'),
('Arts and Culture', 'http://feeds.newyorker.com/services/rss/feeds/arts.xml'),
('Humor', 'http://feeds.newyorker.com/services/rss/feeds/humor.xml'),
('Fiction and Poetry', 'http://feeds.newyorker.com/services/rss/feeds/fiction.xml'),
('Comment', 'http://feeds.newyorker.com/services/rss/feeds/comment.xml'),
('The Financial Page', 'http://feeds.newyorker.com/services/rss/feeds/financial.xml'),
('Politics', 'http://feeds.newyorker.com/services/rss/feeds/politics.xml'),
('Movies', 'http://feeds.newyorker.com/services/rss/feeds/movies.xml'),
('Books', 'http://feeds.newyorker.com/services/rss/feeds/books.xml'),
('Tables For Two', 'http://feeds.newyorker.com/services/rss/feeds/tables.xml'),
]

View File

@ -1,24 +0,0 @@
## By Lorenzo goehr, lorenzogoehr@hotmail.com for Libprs500 by Kovid Goyal
from calibre.ebooks.lrf.web.profiles import DefaultProfile
import re
class NewYorkReviewOfBooks(DefaultProfile):
title = 'New York Review of Books'
max_recursions = 2
max_articles_per_feed = 50
html_description = True
no_stylesheets = True
def get_feeds(self):
return [ ('Current Issue', 'http://feeds.feedburner.com/nybooks') ]
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
(r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">&copy;)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
(r'<div class="nav">.*?<h2>', lambda match: '<h2>'),
(r'<table.*?>.*?(<img .*?/table>)', lambda match: match.group(1),), ] ]

View File

@ -1,100 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download the New York Times
'''
import re, time
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(DefaultProfile):
title = 'The New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
max_recursions = 2
recommended_frequency = 1
encoding = 'cp1252'
html2lrf_options = ['--base-font-size=0']
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove header bar
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
(r'<div class="articleTools">.*></ul>', lambda match : ''),
# Remove footer bar
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
(r'<div id="footer">.*', lambda match : '</body></html>'),
]
]
def get_browser(self):
br = DefaultProfile.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def get_feeds(self):
src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
'Dining & Wine', 'Home & Garden', 'Multimedia',
'Most E-mailed Articles',
'Automobiles', 'Fashion & Style', 'Television News',
'Education']:
feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
return feeds
def parse_feeds(self):
if self.lrf: # The new feed causes the SONY Reader to crash
return DefaultProfile.parse_feeds(self)
src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
soup = BeautifulSoup(src)
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
if div['class'] == 'section-headline':
key = feed_title(div)
articles[key] = []
elif div['class'] in ['story', 'story headline']:
a = div.find('a', href=True)
if not a:
continue
url = self.print_version(a['href'])
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = time.strftime('%a, %d %b', time.localtime())
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate, description=description,
content=''))
return articles
def print_version(self, url):
return url + '?&pagewanted=print'

View File

@ -1,40 +0,0 @@
##
## web2lrf profile to download articles from Portfolio.com
##
'''
'''
from calibre.ebooks.lrf.web.profiles import FullContentProfile
class Portfolio(FullContentProfile):
title = 'Portfolio'
max_articles_per_feed = 50
timefmt = ' [%a, %b %d, %Y]'
html_description = True
no_stylesheets = True
html2lrf_options = ['--ignore-tables']
##delay = 1
oldest_article = 30
## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
def get_feeds(self):
return [
('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'),
('Careers', 'http://feeds.portfolio.com/portfolio/careers'),
('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
('Executives','http://feeds.portfolio.com/portfolio/executives'),
('News and Markets', 'http://feeds.portfolio.com/portfolio/news'),
('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'),
('Capital', 'http://feeds.portfolio.com/portfolio/capital'),
('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'),
('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'),
('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'),
('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'),
('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'),
('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'),
('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
]

View File

@ -1,39 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class Reuters(DefaultProfile):
title = 'Reuters'
max_recursions = 2
max_articles_per_feed = 10
html_description = True
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
##(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<body>.*?<div class="contentBand">', lambda match : '<body>'),
(r'<h3>Share:</h3>.*?</body>', lambda match : '<!-- END:: Shared Module id=36615 --></body>'),
(r'<div id="atools" class="articleTools">.*?<div class="linebreak">', lambda match : '<div class="linebreak">'),
]
]
def get_feeds(self):
return [ ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'),
('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'),
('Environment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'),
('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml')
]
def print_version(self, url):
return ('http://www.reuters.com/article/id' + url + '?sp=true')

View File

@@ -1,36 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Spiegel Online.
'''
from calibre.ebooks.lrf.web.profiles import DefaultProfile
import re
class SpiegelOnline(DefaultProfile):
title = 'Spiegel Online'
timefmt = ' [ %Y-%m-%d %a]'
max_recursions = 2
max_articles_per_feed = 40
use_pubdate = False
no_stylesheets = True
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove Zum Thema footer
(r'<div class="spArticleCredit.*?</body>', lambda match: '</body>'),
]
]
def get_feeds(self):
return [ ('Spiegel Online', 'http://www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml') ]
def print_version(self,url):
tokens = url.split(',')
tokens[-2:-2] = ['druck|']
return ','.join(tokens).replace('|,','-')
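A worked example of the token manipulation above, using a hypothetical article URL in the usual Spiegel form (the numeric ids are made up):

# Hypothetical input URL; only its shape matters.
url = 'http://www.spiegel.de/politik/deutschland/0,1518,123456,00.html'
tokens = url.split(',')
tokens[-2:-2] = ['druck|']    # insert the print marker before the article id
print ','.join(tokens).replace('|,', '-')
# http://www.spiegel.de/politik/deutschland/0,1518,druck-123456,00.html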

View File

@@ -1,36 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class UnitedPressInternational(DefaultProfile):
title = 'United Press International'
max_recursions = 2
max_articles_per_feed = 15
html2lrf_options = ['--override-css= "H1 {font-family: Arial; font-weight: bold; color: #000000; size: 10pt;}"']
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<body onload=.*?>.*?<a href="http://www.upi.com">', lambda match : '<body style="font: 8pt arial;">'),
##(r'<div class=\'headerDIV\'><h2><a style="color: #990000;" href="http://www.upi.com/NewsTrack/Top_News/">Top News</a></h2></div>.*?<br clear="all">', lambda match : ''),
(r'<script src="http://www.g.*?>.*?</body>', lambda match : ''),
(r'<span style="font: 16pt arial', lambda match : '<span style="font: 12pt arial'),
]
]
def get_feeds(self):
return [ ('Top Stories', 'http://www.upi.com/rss/NewsTrack/Top_News/'),
('Science', 'http://www.upi.com/rss/NewsTrack/Science/'),
('Health', 'http://www.upi.com/rss/NewsTrack/Health/'),
('Quirks', 'http://www.upi.com/rss/NewsTrack/Quirks/'),
]
def print_version(self, url):
return (url + 'print_view/')

View File

@@ -1,43 +0,0 @@
'''
Profile to download USA Today.
'''
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class USAToday(DefaultProfile):
title = 'USA Today'
max_recursions = 2
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 20
html_description = True
#no_stylesheets = True
preprocess_regexps = [
(re.compile(r'<BODY.*?<!--Article Goes Here-->', re.IGNORECASE | re.DOTALL), lambda match : '<BODY>'),
(re.compile(r'<!--Article End-->.*?</BODY>', re.IGNORECASE | re.DOTALL), lambda match : '</BODY>'),
]
## Getting the print version
def print_version(self, url):
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
## Comment out the feeds you don't want retrieved.
## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF
## If you want one of these at the top, append a space in front of the name.
def get_feeds(self):
return [
(' Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
]

View File

@@ -1,44 +0,0 @@
import re
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class WashingtonPost(DefaultProfile):
title = 'Washington Post'
max_recursions = 2
max_articles_per_feed = 20
use_pubdate = False
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
(r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
(r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
(r'<script.*?>.*?</script>', lambda match : ''),
(r'<body.*?>.*?.correction {', lambda match : '<body><style>.correction {'),
(r'<span class="display:none;" name="pubDate".*?>.*?</body>', lambda match : '<body>'),
]
]
def get_feeds(self):
return [ ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'),
('Politics', 'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'),
('Nation', 'http://www.washingtonpost.com/wp-dyn/rss/nation/index.xml'),
('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'),
('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'),
('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'),
('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'),
('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'),
('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'),
]
def print_version(self, url):
return (url.rpartition('.')[0] + '_pf.html')
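A quick illustration of the suffix rewrite above, using a hypothetical article URL:

# Hypothetical URL; rpartition('.') drops the trailing '.html' so the
# printer-friendly '_pf.html' suffix can be appended.
url = 'http://www.washingtonpost.com/wp-dyn/content/article/2009/05/01/AR2009050101234.html'
print url.rpartition('.')[0] + '_pf.html'
# http://www.washingtonpost.com/wp-dyn/content/article/2009/05/01/AR2009050101234_pf.html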

View File

@@ -1,108 +0,0 @@
##
## web2lrf profile to download articles from WSJ.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from urlparse import urlparse
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class WallStreetJournal(DefaultProfile):
title = 'The Wall Street Journal'
max_recursions = 2
needs_subscription = True
no_stylesheets = False
max_articles_per_feed = 10
timefmt = ' [%a, %b %d, %Y]'
html2lrf_options = ['--ignore-tables']
## Don't grab articles more than 7 days old
oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'),
]
]
def get_browser(self):
br = DefaultProfile.get_browser()
if self.username is not None and self.password is not None:
br.open('http://online.wsj.com/login')
br.select_form(name='login_form')
br['user'] = self.username
br['password'] = self.password
br.submit()
return br
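For readers unfamiliar with the browser object returned by get_browser: a minimal standalone sketch of the same mechanize login flow, using the 'login_form' form name from the code above and placeholder credentials.

import mechanize

br = mechanize.Browser()
br.open('http://online.wsj.com/login')
br.select_form(name='login_form')    # pick the login form by its name
br['user'] = 'example_user'          # placeholder credentials
br['password'] = 'example_password'
br.submit()                          # the session cookie is kept for later requests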
def print_version(self, url):
article = urlparse(url).path.rpartition('/')[-1]
return 'http://online.wsj.com/article_print/'+article
## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
def get_feeds(self):
return [
#('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
#('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
#('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
(' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
(' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
# ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
(' Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
(' Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
(' Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
(' Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
(' Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
# ('Personal Journal - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
# ('Weekend & Leisure - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
# ('Weekend & Leisure - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
]
## Logout of website
## NOT CURRENTLY WORKING
# def cleanup(self):
# self.browser.open('http://commerce.wsj.com/auth/postlogout')

View File

@@ -1,26 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Die Zeit.
'''
from calibre.ebooks.lrf.web.profiles import DefaultProfile
class ZeitNachrichten(DefaultProfile):
title = 'Die Zeit Nachrichten'
timefmt = ' [%d %b %Y]'
max_recursions = 2
max_articles_per_feed = 40
html_description = True
no_stylesheets = True
encoding = 'latin1'
def get_feeds(self):
return [ ('Zeit.de', 'http://newsfeed.zeit.de/news/index') ]
def print_version(self,url):
return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '')

View File

@@ -29,6 +29,10 @@ class MOBIOutput(OutputFormatPlugin):
),
])
recommendations = set([
('dont_justify', True, OptionRecommendation.HIGH),
])
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, MobiWriter
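The recommendation added above follows a (option_name, recommended_value, level) convention. A minimal sketch of how another output plugin might declare one, assuming the calibre.customize.conversion module path (the plugin below is hypothetical):

# Hypothetical plugin: only the shape of the recommendations set matters.
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation

class ExampleOutput(OutputFormatPlugin):
    recommendations = set([
        ('dont_justify', True, OptionRecommendation.HIGH),
    ])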

View File

@@ -22,7 +22,6 @@ entry_points = {
'web2disk = calibre.web.fetch.simple:main',
'feeds2disk = calibre.web.feeds.main:main',
'calibre-server = calibre.library.server:main',
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
'isbndb = calibre.ebooks.metadata.isbndb:main',

View File

@@ -19,7 +19,7 @@ from calibre import browser, __appname__, iswindows, \
strftime, __version__, preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.lrf import entity_to_unicode
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks import render_html
from calibre.ebooks.metadata.toc import TOC

View File

@@ -46,13 +46,12 @@ recipe_modules = ['recipe_' + r for r in (
import re, imp, inspect, time, os
from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.path import path
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre import __appname__, english_sort
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, CustomIndexRecipe)
basic_recipe_names = (i.__name__ for i in basic_recipes)
@@ -83,7 +82,7 @@ def compile_recipe(src):
Compile the code in src and return the first object that is a recipe or profile.
@param src: Python source code
@type src: string
@return: Recipe/Profile class or None, if no such class was found in C{src}
@return: Recipe class or None, if no such class was found in C{src}
'''
global _tdir, _crep
if _tdir is None or not os.path.exists(_tdir):
@@ -97,7 +96,6 @@ def compile_recipe(src):
src = re.sub(r'from __future__.*', '', src)
f = open(temp, 'wb')
src = 'from %s.web.feeds.news import BasicNewsRecipe, AutomaticNewsRecipe\n'%__appname__ + src
src = 'from %s.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile\n'%__appname__ + src
src = '# coding: utf-8\n' + src
src = 'from __future__ import with_statement\n' + src
@@ -108,7 +106,7 @@ def compile_recipe(src):
module = imp.load_module(temp.namebase, *module)
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and \
issubclass(x, (DefaultProfile, BasicNewsRecipe)) and \
issubclass(x, (BasicNewsRecipe,)) and \
x not in basic_recipes)
if not classes:
return None
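A minimal usage sketch of compile_recipe, assuming the calibre.web.feeds.recipes module path and a made-up recipe source string; note that compile_recipe prepends the BasicNewsRecipe import itself (see the src rewriting above), so the source does not need one.

# Hypothetical recipe source; the class name and feed URL are made up.
src = '''
class ExampleRecipe(BasicNewsRecipe):
    title = 'Example Recipe'
    feeds = [('News', 'http://example.com/rss.xml')]
'''

from calibre.web.feeds.recipes import compile_recipe   # assumed module path
recipe_class = compile_recipe(src)
if recipe_class is not None:
    print recipe_class.title   # -> Example Recipe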
@@ -119,11 +117,10 @@ def compile_recipe(src):
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whose title == C{title} or None if no such
recipe exists. Also returns a flag that is True iff the found recipe is really
an old-style Profile.
recipe exists.
@type title: string
@rtype: class or None, boolean
@rtype: class or None
'''
for r in recipes:
if r.title == title: