mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
feeds2lrf
This commit is contained in:
parent
fb53f18a8f
commit
00b0cf46fc
@ -18,7 +18,7 @@ __docformat__ = "epytext"
|
||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||
__appname__ = 'libprs500'
|
||||
|
||||
import sys, os, logging, mechanize, locale, cStringIO, re, subprocess, textwrap
|
||||
import sys, os, logging, mechanize, locale, copy, cStringIO, re, subprocess, textwrap
|
||||
from gettext import GNUTranslations
|
||||
from math import floor
|
||||
from optparse import OptionParser as _OptionParser
|
||||
@ -143,38 +143,64 @@ class OptionParser(_OptionParser):
|
||||
raise Exception(msg)
|
||||
_OptionParser.error(self, msg)
|
||||
|
||||
def merge(self, parser):
    '''
    Add options from parser to self. In case of conflicts, conflicting options from
    parser are skipped.

    Top level options of C{parser} are added directly to self. Every option
    group of C{parser} is recreated on self with the same title and
    description and filled with that group's non conflicting options.

    @param parser: The option parser whose options are merged into self.
                   Only deep copies of its options are added to self.
    '''
    def merge_options(options, container):
        # Work on deep copies so that the option objects owned by parser
        # are never handed over to another container.
        for opt in copy.deepcopy(options):
            # A conflict is detected via the option's primary option string.
            if not self.has_option(opt.get_opt_string()):
                container.add_option(opt)

    merge_options(list(parser.option_list), self)

    for group in list(parser.option_groups):
        # Carry over the description as well as the title, otherwise the
        # merged group loses its explanatory text in the help output.
        g = self.add_option_group(group.title, group.description)
        merge_options(group.option_list, g)
|
||||
|
||||
def subsume(self, group_name, msg=''):
|
||||
'''
|
||||
Move all existing options into a subgroup named
|
||||
C{group_name} with description C{msg}.
|
||||
'''
|
||||
opts = list(self.option_list)
|
||||
groups = list(self.option_groups)
|
||||
exclude = []
|
||||
|
||||
for opt in opts:
|
||||
ops = opt.get_opt_string()
|
||||
if ops in ('--help', '--version'):
|
||||
exclude.append(opt)
|
||||
else:
|
||||
self.remove_option(ops)
|
||||
for group in groups:
|
||||
for opt in group.option_list:
|
||||
opts.append(opt)
|
||||
group.remove_option(opt)
|
||||
|
||||
opts = [opt for opt in self.options_iter() if opt.get_opt_string() not in ('--version', '--help')]
|
||||
self.option_groups = []
|
||||
subgroup = self.add_option_group(group_name, msg)
|
||||
for opt in opts:
|
||||
if opt in exclude:
|
||||
continue
|
||||
self.remove_option(opt.get_opt_string())
|
||||
subgroup.add_option(opt)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def options_iter(self):
    '''
    Iterate over all options of this parser: first the top level options,
    then the options of every option group, in order. Options whose
    string representation is blank are left out.
    '''
    def non_blank(options):
        return (candidate for candidate in options if str(candidate).strip())

    for option in non_blank(self.option_list):
        yield option
    for group in self.option_groups:
        for option in non_blank(group.option_list):
            yield option
|
||||
|
||||
def option_by_dest(self, dest):
    '''
    Return the first option yielded by C{self.options_iter()} whose
    C{dest} attribute equals C{dest}, or C{None} if no option matches.
    '''
    found = None
    for candidate in self.options_iter():
        if candidate.dest == dest:
            found = candidate
            break
    return found
|
||||
|
||||
def merge_options(self, lower, upper):
    '''
    Merge options in lower and upper option lists into upper.
    Default values in upper are overridden by
    non default values in lower.

    Attributes that exist only in lower, or for which this parser has no
    option, are left alone.

    @param lower: Parsed option values with the lower priority.
    @param upper: Parsed option values with the higher priority. Modified in place.
    '''
    from optparse import NO_DEFAULT

    for dest, lower_value in lower.__dict__.items():
        if dest not in upper.__dict__:
            continue
        opt = self.option_by_dest(dest)
        if opt is None:
            # The value was not produced by one of this parser's options
            # (e.g. it was set on the values object by hand), so there is
            # no default to compare against.
            continue
        default = opt.default
        if default is NO_DEFAULT:
            # An option declared without a default parses to None.
            default = None
        if lower_value != default and upper.__dict__[dest] == default:
            upper.__dict__[dest] = lower_value
|
||||
|
||||
|
||||
def load_library(name, cdll):
|
||||
|
16
src/libprs500/ebooks/lrf/feeds/__init__.py
Normal file
16
src/libprs500/ebooks/lrf/feeds/__init__.py
Normal file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
71
src/libprs500/ebooks/lrf/feeds/convert_from.py
Normal file
71
src/libprs500/ebooks/lrf/feeds/convert_from.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
'''
|
||||
Convert web feeds to LRF files.
|
||||
'''
|
||||
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from libprs500.ebooks.lrf.html.convert_from import process_file
|
||||
from libprs500.web.feeds.main import option_parser as feeds_option_parser
|
||||
from libprs500.web.feeds.main import run_recipe
|
||||
from libprs500.ptempfile import PersistentTemporaryDirectory
|
||||
from libprs500 import sanitize_file_name
|
||||
|
||||
import sys, os, time
|
||||
|
||||
def option_parser():
    '''
    Build the command line option parser for feeds2lrf.

    Starts from the feeds2disk parser with its C{--output-dir} and C{--lrf}
    options removed, moves the remaining options into a
    "FEEDS2DISK OPTIONS" group, and then merges in the html2lrf options,
    which are first moved into an "HTML2LRF OPTIONS" group.
    '''
    feeds_parser = feeds_option_parser()
    for unwanted in ('--output-dir', '--lrf'):
        feeds_parser.remove_option(unwanted)
    feeds_parser.subsume('FEEDS2DISK OPTIONS', _('Options to control the behavior of feeds2disk'))

    html2lrf_parser = lrf_option_parser('')
    html2lrf_parser.subsume('HTML2LRF OPTIONS', _('Options to control the behavior of html2lrf'))

    feeds_parser.merge(html2lrf_parser)
    return feeds_parser
|
||||
|
||||
def main(args=sys.argv, notification=None, handler=None):
    '''
    Command line entry point of feeds2lrf: download a recipe into a
    temporary directory and convert the resulting index.html with
    C{process_file}.

    @param args: Command line arguments, C{args[0]} being the program name.
    @param notification: Passed through unchanged to C{run_recipe}.
    @param handler: Passed through unchanged to C{run_recipe}.
    @return: 1 if neither a recipe argument nor C{--feeds} was given
             (usage help is printed), otherwise 0.
    @raise RuntimeError: If the download left no readable index.html.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    opts.lrf = True

    if len(args) != 2 and opts.feeds is None:
        parser.print_help()
        return 1

    recipe_arg = args[1] if len(args) > 1 else None

    tdir = PersistentTemporaryDirectory('_feeds2lrf')
    opts.output_dir = tdir

    recipe = run_recipe(opts, recipe_arg, parser, notification=notification, handler=handler)

    htmlfile = os.path.join(tdir, 'index.html')
    if not os.access(htmlfile, os.R_OK):
        # recipe_arg is None when only --feeds was given; name the feeds
        # instead so that building the message cannot itself fail.
        failed = recipe_arg if recipe_arg is not None else str(opts.feeds)
        raise RuntimeError(_('Fetching of recipe failed: ')+failed)

    # Options requested by the recipe for html2lrf are applied wherever
    # the corresponding option in opts still has its default value.
    lparser = lrf_option_parser('')
    ropts = lparser.parse_args(['html2lrf']+recipe.html2lrf_options)[0]
    parser.merge_options(ropts, opts)

    if not opts.output:
        ext = '.lrs' if opts.lrs else '.lrf'
        fname = recipe.title + time.strftime(recipe.timefmt)+ext
        opts.output = os.path.join(os.getcwd(), sanitize_file_name(fname))
    # Single-argument call form: same output under Python 2 and Python 3.
    print('Generating LRF...')
    process_file(htmlfile, opts)
    return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -1715,7 +1715,7 @@ def process_file(path, options, logger=None):
|
||||
|
||||
tpath = ''
|
||||
try_opf(path, options, logger)
|
||||
if options.cover:
|
||||
if getattr(options, 'cover', None):
|
||||
options.cover = os.path.expanduser(options.cover)
|
||||
if not os.path.isabs(options.cover):
|
||||
options.cover = os.path.join(dirpath, options.cover)
|
||||
@ -1750,7 +1750,7 @@ def process_file(path, options, logger=None):
|
||||
options.title = default_title
|
||||
|
||||
for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
|
||||
val = getattr(options, prop)
|
||||
val = getattr(options, prop, None)
|
||||
if val and not isinstance(val, unicode):
|
||||
soup = BeautifulSoup(val)
|
||||
setattr(options, prop, unicode(soup))
|
||||
@ -1822,13 +1822,14 @@ def try_opf(path, options, logger):
|
||||
break
|
||||
if opf is None:
|
||||
return
|
||||
|
||||
dirpath = os.path.dirname(os.path.abspath(opf))
|
||||
opf = OPFReader(open(opf, 'rb'), dirpath)
|
||||
try:
|
||||
title = opf.title
|
||||
if title and not options.title:
|
||||
if title and not getattr(options, 'title', None):
|
||||
options.title = title
|
||||
if options.author == 'Unknown':
|
||||
if getattr(options, 'author', 'Unknown') == 'Unknown':
|
||||
if opf.authors:
|
||||
options.author = ', '.join(opf.authors)
|
||||
if opf.author_sort:
|
||||
@ -1837,12 +1838,12 @@ def try_opf(path, options, logger):
|
||||
publisher = opf.publisher
|
||||
if publisher:
|
||||
options.publisher = publisher
|
||||
if not options.category:
|
||||
if not getattr(options, 'category', None):
|
||||
category = opf.category
|
||||
if category:
|
||||
options.category = category
|
||||
if not options.cover or options.use_metadata_cover:
|
||||
orig_cover = options.cover
|
||||
if not getattr(options, 'cover', None) or options.use_metadata_cover:
|
||||
orig_cover = getattr(options, 'cover', None)
|
||||
options.cover = None
|
||||
cover = opf.cover
|
||||
if cover:
|
||||
@ -1865,10 +1866,10 @@ def try_opf(path, options, logger):
|
||||
break
|
||||
except:
|
||||
continue
|
||||
if not options.cover and orig_cover is not None:
|
||||
if not getattr(options, 'cover', None) and orig_cover is not None:
|
||||
options.cover = orig_cover
|
||||
options.spine = [i.href for i in opf.spine.items()]
|
||||
if not hasattr(options, 'toc') or options.toc is None:
|
||||
if not getattr(options, 'toc', None):
|
||||
options.toc = opf.toc
|
||||
except Exception:
|
||||
logger.exception('Failed to process opf file')
|
||||
|
@ -503,7 +503,7 @@ class OPFReader(OPF):
|
||||
stream.close()
|
||||
self.manifest = Manifest(self.soup, dir)
|
||||
self.spine = Spine(self.soup, self.manifest)
|
||||
self.toc = TOC()
|
||||
self.toc = TOC(base_path=dir)
|
||||
self.toc.read_from_opf(self)
|
||||
self.cover_data = (None, None)
|
||||
|
||||
@ -554,13 +554,15 @@ class OPFCreator(MetaInformation):
|
||||
self.manifest = rentries
|
||||
|
||||
def create_manifest_from_files_in(self, files_and_dirs):
|
||||
#self.base_path = os.path.commonprefix(files_and_dirs)
|
||||
entries = []
|
||||
|
||||
def dodir(dir):
|
||||
for root, dirs, files in os.walk(dir):
|
||||
for name in files:
|
||||
path = os.path.join(root, name)
|
||||
entries.append((path, None))
|
||||
if os.path.isfile(path):
|
||||
entries.append((path, None))
|
||||
|
||||
for i in files_and_dirs:
|
||||
if os.path.isdir(i):
|
||||
|
@ -166,6 +166,7 @@ def setup_completion(fatal_errors):
|
||||
from libprs500.ebooks.mobi.reader import option_parser as mobioeb
|
||||
from libprs500.web.feeds.main import option_parser as feeds2disk
|
||||
from libprs500.web.feeds.recipes import titles as feed_titles
|
||||
from libprs500.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||
|
||||
f = open_file('/etc/bash_completion.d/libprs500')
|
||||
|
||||
@ -191,6 +192,7 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
|
||||
f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
|
||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||
f.write('''
|
||||
_prs500_ls()
|
||||
{
|
||||
|
@ -79,7 +79,7 @@ class Feed(object):
|
||||
self.oldest_article = oldest_article
|
||||
|
||||
for item in entries:
|
||||
if len(self.articles) > max_articles_per_feed:
|
||||
if len(self.articles) >= max_articles_per_feed:
|
||||
break
|
||||
self.parse_article(item)
|
||||
|
||||
|
@ -41,7 +41,7 @@ Available builtin recipes are:
|
||||
p.remove_option('--base-dir')
|
||||
p.remove_option('--verbose')
|
||||
p.remove_option('--max-files')
|
||||
p.subsume('WEB2DISK OPTIONS', 'Options to control web2disk (used to fetch websites linked from feeds)')
|
||||
p.subsume('WEB2DISK OPTIONS', _('Options to control web2disk (used to fetch websites linked from feeds)'))
|
||||
|
||||
p.add_option('--feeds', default=None,
|
||||
help=_('''Specify a list of feeds to download. For example:
|
||||
@ -50,7 +50,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
|
||||
p.add_option('--verbose', default=False, action='store_true',
|
||||
help=_('''Be more verbose while processing.'''))
|
||||
p.add_option('--title', default=None,
|
||||
help='The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.')
|
||||
help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
|
||||
p.add_option('--username', default=None, help=_('Username for sites that require a login to access content.'))
|
||||
p.add_option('--password', default=None, help=_('Password for sites that require a login to access content.'))
|
||||
p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
|
||||
@ -61,7 +61,9 @@ If you specify this option, any argument to %prog is ignored and a default recip
|
||||
p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
|
||||
help=_('Dont show the progress bar'))
|
||||
p.add_option('--debug', action='store_true', default=False,
|
||||
help='Very verbose output, useful for debugging.')
|
||||
help=_('Very verbose output, useful for debugging.'))
|
||||
p.add_option('--test', action='store_true', default=False,
|
||||
help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
|
||||
|
||||
return p
|
||||
|
||||
@ -72,10 +74,10 @@ def simple_progress_bar(percent, msg):
|
||||
def no_progress_bar(percent, msg):
    '''
    Progress callback that ignores the percentage and just prints the message.

    @param percent: Progress value; not used.
    @param msg: The message to print.
    '''
    # Single-argument call form: same output under Python 2 and Python 3.
    print(msg)
|
||||
|
||||
def main(args=sys.argv, notification=None, handler=None):
|
||||
p = option_parser()
|
||||
opts, args = p.parse_args(args)
|
||||
|
||||
class RecipeError(Exception):
    '''Raised when a recipe argument cannot be resolved to a valid recipe.'''
|
||||
|
||||
def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
|
||||
if notification is None:
|
||||
from libprs500.terminfo import TerminalController, ProgressBar
|
||||
term = TerminalController(sys.stdout)
|
||||
@ -89,18 +91,15 @@ def main(args=sys.argv, notification=None, handler=None):
|
||||
else:
|
||||
notification = no_progress_bar
|
||||
|
||||
if len(args) != 2 and opts.feeds is None:
|
||||
p.print_help()
|
||||
return 1
|
||||
|
||||
recipe = None
|
||||
if opts.feeds is not None:
|
||||
recipe = BasicNewsRecipe
|
||||
else:
|
||||
try:
|
||||
if os.access(args[1], os.R_OK):
|
||||
if os.access(recipe_arg, os.R_OK):
|
||||
try:
|
||||
recipe = compile_recipe(open(args[1]).read())
|
||||
recipe = compile_recipe(open(recipe_arg).read())
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
@ -108,15 +107,13 @@ def main(args=sys.argv, notification=None, handler=None):
|
||||
else:
|
||||
raise Exception('not file')
|
||||
except:
|
||||
recipe = get_builtin_recipe(args[1])
|
||||
recipe = get_builtin_recipe(recipe_arg)
|
||||
if recipe is None:
|
||||
recipe = compile_recipe(args[1])
|
||||
recipe = compile_recipe(recipe_arg)
|
||||
|
||||
if recipe is None:
|
||||
p.print_help()
|
||||
print
|
||||
print args[1], 'is an invalid recipe'
|
||||
return 1
|
||||
raise RecipeError(recipe_arg+ ' is an invalid recipe')
|
||||
|
||||
|
||||
if handler is None:
|
||||
from libprs500 import ColoredFormatter
|
||||
@ -125,9 +122,23 @@ def main(args=sys.argv, notification=None, handler=None):
|
||||
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
|
||||
logging.getLogger('feeds2disk').addHandler(handler)
|
||||
|
||||
recipe = recipe(opts, p, notification)
|
||||
recipe = recipe(opts, parser, notification)
|
||||
if not os.path.exists(recipe.output_dir):
|
||||
os.makedirs(recipe.output_dir)
|
||||
recipe.download()
|
||||
|
||||
|
||||
return recipe
|
||||
|
||||
def main(args=sys.argv, notification=None, handler=None):
    '''
    Command line entry point of feeds2disk.

    @param args: Command line arguments, C{args[0]} being the program name.
    @param notification: Passed through unchanged to C{run_recipe}.
    @param handler: Passed through unchanged to C{run_recipe}.
    @return: 0 on success; 1 if usage help was printed because no recipe
             or feeds were given or because the recipe was invalid.
    '''
    p = option_parser()
    opts, args = p.parse_args(args)

    if len(args) != 2 and opts.feeds is None:
        p.print_help()
        return 1
    recipe_arg = args[1] if len(args) > 1 else None
    try:
        run_recipe(opts, recipe_arg, p, notification=notification, handler=handler)
    except RecipeError:
        # Report an invalid recipe as a usage error rather than letting
        # the exception escape from the command line entry point.
        # sys.exc_info() is used so the syntax is valid on any Python version.
        err = sys.exc_info()[1]
        p.print_help()
        print('')
        print(str(err))
        return 1

    return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -165,6 +165,8 @@ class BasicNewsRecipe(object):
|
||||
'''
|
||||
if not self.feeds:
|
||||
raise NotImplementedError
|
||||
if self.test:
|
||||
return self.feeds[:2]
|
||||
return self.feeds
|
||||
|
||||
@classmethod
|
||||
@ -225,10 +227,13 @@ class BasicNewsRecipe(object):
|
||||
@param parser: Command line option parser. Used to intelligently merge options.
|
||||
@param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
|
||||
'''
|
||||
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
|
||||
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
|
||||
setattr(self, attr, getattr(options, attr))
|
||||
self.output_dir = os.path.abspath(self.output_dir)
|
||||
|
||||
if options.test:
|
||||
self.max_articles_per_feed = 2
|
||||
self.simultaneous_downloads = min(4, self.simultaneous_downloads)
|
||||
|
||||
self.logger = logging.getLogger('feeds2disk')
|
||||
|
||||
if self.debug:
|
||||
@ -288,10 +293,12 @@ class BasicNewsRecipe(object):
|
||||
self.simultaneous_downloads = 1
|
||||
|
||||
self.navbar = templates.NavBarTemplate()
|
||||
self.max_articles_per_feed -= 1
|
||||
self.html2lrf_options.append('--use-spine')
|
||||
self.failed_downloads = []
|
||||
self.partial_failures = []
|
||||
|
||||
|
||||
|
||||
|
||||
def _postprocess_html(self, soup):
|
||||
if self.extra_css is not None:
|
||||
@ -383,6 +390,8 @@ class BasicNewsRecipe(object):
|
||||
def build_index(self):
|
||||
self.report_progress(0, _('Fetching feeds...'))
|
||||
feeds = self.parse_feeds()
|
||||
if self.test:
|
||||
feeds = feeds[:2]
|
||||
self.has_single_feed = len(feeds) == 1
|
||||
|
||||
index = os.path.join(self.output_dir, 'index.html')
|
||||
@ -460,13 +469,14 @@ class BasicNewsRecipe(object):
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
|
||||
mi.author_sort = __appname__
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
|
||||
|
||||
manifest = ['feed_%d'%i for i in range(len(feeds))]
|
||||
manifest.append('index.html')
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
|
@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
|
||||
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
|
||||
]
|
||||
|
||||
extra_css = '#content { font:serif 1.2em; }'
|
||||
extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
|
||||
keep_only_tags = [dict(name='div', id='content')]
|
||||
|
||||
remove_tags = [
|
||||
@ -54,11 +54,6 @@ class Newsweek(BasicNewsRecipe):
|
||||
recursions = 1
|
||||
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
|
||||
|
||||
# For testing
|
||||
#feeds = feeds[3:5]
|
||||
#max_articles_per_feed = 2
|
||||
|
||||
|
||||
|
||||
def postprocess_html(self, soup):
|
||||
divs = list(soup.findAll('div', 'pagination'))
|
||||
|
@ -37,10 +37,12 @@ def basename(url):
|
||||
return res
|
||||
|
||||
def save_soup(soup, target):
|
||||
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
|
||||
meta = soup.find('meta', content=True)
|
||||
if meta and 'charset' in meta['content']:
|
||||
meta.replaceWith(nm)
|
||||
ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
|
||||
nm = ns.find('meta')
|
||||
metas = soup.findAll('meta', content=True)
|
||||
for meta in metas:
|
||||
if 'charset' in meta['content']:
|
||||
meta.replaceWith(nm)
|
||||
f = codecs.open(target, 'w', 'utf-8')
|
||||
f.write(unicode(soup))
|
||||
f.close()
|
||||
|
Loading…
x
Reference in New Issue
Block a user