mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KG pre-6.44 release
This commit is contained in:
commit
b4481eaf76
@ -1,27 +1,41 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010 Starson17'
|
||||
'''
|
||||
fudzilla.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Fudzilla(BasicNewsRecipe):
|
||||
title = u'Fudzilla'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Starson17'
|
||||
language = 'en'
|
||||
|
||||
description = 'Tech news'
|
||||
oldest_article = 7
|
||||
remove_javascript = True
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
|
||||
feeds = [ (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')]
|
||||
|
||||
def print_version(self, url):
|
||||
nurl = url.replace('http://www.fudzilla.com/index.php','http://www.fudzilla.com/index2.php')
|
||||
nmain, nsep, nrest = nurl.partition('&Itemid=')
|
||||
return nmain + '&pop=1&page=0&Itemid=1'
|
||||
remove_tags_before = dict(name='div', attrs={'class':['padding']})
|
||||
|
||||
remove_tags = [dict(name='td', attrs={'class':['left','right']}),
|
||||
dict(name='div', attrs={'id':['toolbar','buttons']}),
|
||||
dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
|
||||
dict(name='span', attrs={'class':['pathway']}),
|
||||
dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
|
||||
dict(name='table', attrs={'class':['headlines']}),
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<p class="MsoNormal"> Welcome.*</p> ', re.DOTALL|re.IGNORECASE), lambda match: '')
|
||||
]
|
||||
|
@ -84,4 +84,4 @@ class JournalofHospitalMedicine(BasicNewsRecipe):
|
||||
for img in soup.findAll('img', src=True):
|
||||
img['src'] = img['src'].replace('tfig', 'nfig')
|
||||
return soup
|
||||
|
||||
|
||||
|
@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
|
||||
description = _('Read metadata from %s files') % 'CHM'
|
||||
|
||||
def get_metadata(self, stream, ftype):
|
||||
from calibre.ebooks.metadata.chm import get_metadata
|
||||
from calibre.ebooks.chm.metadata import get_metadata
|
||||
return get_metadata(stream)
|
||||
|
||||
|
||||
|
@ -12,7 +12,6 @@ from cStringIO import StringIO
|
||||
from struct import unpack
|
||||
|
||||
from calibre.devices.usbms.driver import USBMS
|
||||
from calibre.utils.logging import Log
|
||||
|
||||
class KINDLE(USBMS):
|
||||
|
||||
@ -116,7 +115,6 @@ class KINDLE(USBMS):
|
||||
path_map.pop(id)
|
||||
return path_map, book_ext
|
||||
|
||||
log = Log()
|
||||
storage = get_storage()
|
||||
path_map, book_ext = resolve_bookmark_paths(storage, path_map)
|
||||
|
||||
@ -358,4 +356,4 @@ class Bookmark():
|
||||
self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length])
|
||||
|
||||
else:
|
||||
print "unsupported bookmark_extension: %s" % bookmark_extension
|
||||
print "unsupported bookmark_extension: %s" % self.bookmark_extension
|
||||
|
@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
|
||||
rdr = CHMReader(chm_path, log)
|
||||
log.debug('Extracting CHM to %s' % output_dir)
|
||||
rdr.extract_content(output_dir)
|
||||
self._chm_reader = rdr
|
||||
return rdr.hhc_path
|
||||
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.ebooks.metadata.chm import get_metadata_
|
||||
from calibre.ebooks.chm.metadata import get_metadata_from_reader
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
log.debug('Processing CHM...')
|
||||
with TemporaryDirectory('chm2oeb') as tdir:
|
||||
with TemporaryDirectory('_chm2oeb') as tdir:
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
|
||||
log.debug('stream.name=%s' % stream.name)
|
||||
mainname = self._chmtohtml(tdir, chm_name, no_images, log)
|
||||
mainpath = os.path.join(tdir, mainname)
|
||||
#raw_input()
|
||||
|
||||
metadata = get_metadata_(tdir)
|
||||
metadata = get_metadata_from_reader(self._chm_reader)
|
||||
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
|
||||
if isinstance(node.tag, basestring):
|
||||
from calibre.ebooks.chm.reader import match_string
|
||||
|
||||
chapter_path = None
|
||||
if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
|
||||
for child in node:
|
||||
if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
|
||||
|
157
src/calibre/ebooks/chm/metadata.py
Normal file
157
src/calibre/ebooks/chm/metadata.py
Normal file
@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import string_to_authors, MetaInformation
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.ptempfile import TemporaryFile
|
||||
|
||||
def _clean(s):
|
||||
return s.replace(u'\u00a0', u' ')
|
||||
|
||||
def _detag(tag):
|
||||
str = u""
|
||||
for elem in tag:
|
||||
if hasattr(elem, "contents"):
|
||||
str += _detag(elem)
|
||||
else:
|
||||
str += _clean(elem)
|
||||
return str
|
||||
|
||||
|
||||
def _metadata_from_table(soup, searchfor):
    '''
    Look up a metadata value laid out in table cells.

    :param soup: parsed home page; must provide ``find``.
    :param searchfor: regular expression (matched case-insensitively)
                      identifying the label of the wanted value.
    :return: the stripped value text, or ``None`` when no matching cell is
             found or the label cell has no following ``<td>``.
    '''
    label = soup.find('td', text=re.compile(searchfor, flags=re.I))
    if label is None:
        return None
    # The match is stepped up to its parent before being examined.
    cell = label.parent
    # There appear to be multiple ways of structuring the metadata
    # on the home page. Cue some nasty special-case hacks...
    if re.match(r'^\s*' + searchfor + r'\s*$', cell.renderContents(), flags=re.I):
        # The cell holds only the label, so the value lives in the next cell.
        value_cell = cell.findNextSibling('td')
        if value_cell is None:
            # Label without a value cell: report "not found" instead of
            # crashing while trying to iterate None.
            return None
        meta = _detag(value_cell)
        return re.sub('^:', '', meta).strip()
    # Label and value share one cell: drop everything up to the first colon.
    meta = _detag(cell)
    return re.sub(r'^[^:]+:', '', meta).strip()
|
||||
|
||||
def _metadata_from_span(soup, searchfor):
    '''
    Look up a metadata value stored in a ``<span>`` selected by its class.

    :param soup: parsed home page; must provide ``find``.
    :param searchfor: regular expression (matched case-insensitively)
                      against the span's ``class`` attribute.
    :return: the span's rendered contents with surrounding whitespace
             stripped and non-breaking spaces normalised, or ``None`` when
             no such span exists.
    '''
    span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
    if span is None:
        return None
    # Request unicode (encoding=None) instead of the default encoded byte
    # string: the rendered text is fed to _detag() character by character,
    # and unicode replacement on a lone non-ASCII byte raises
    # UnicodeDecodeError.
    # NOTE(review): relies on BeautifulSoup 3 renderContents(encoding=None)
    # returning unicode -- confirm against the bundled BeautifulSoup.
    # this metadata might need some cleaning up still :/
    return _detag(span.renderContents(None).strip())
|
||||
|
||||
def _get_authors(soup):
    '''
    Return the list of authors found on the home page.

    An ``author`` span is tried first, then a table cell labelled ``by``.
    When neither yields any text the list holds the single translated
    placeholder ``Unknown``.
    '''
    aut = (_metadata_from_span(soup, r'author')
           or _metadata_from_table(soup, r'^\s*by\s*:?\s+'))
    # Test truthiness rather than ``is not None``: the table lookup can
    # return an empty string, which should fall back to the placeholder
    # instead of being parsed into an author list.
    if aut:
        return string_to_authors(aut)
    return [_('Unknown')]
|
||||
|
||||
def _get_publisher(soup):
    '''
    Return the publisher: an ``imprint`` span if it yields a value,
    otherwise whatever the ``publisher`` table lookup returns.
    '''
    publisher = _metadata_from_span(soup, 'imprint')
    if not publisher:
        publisher = _metadata_from_table(soup, 'publisher')
    return publisher
|
||||
|
||||
def _get_isbn(soup):
    '''
    Return the ISBN: an ``isbn`` span if it yields a value, otherwise
    whatever the ``isbn`` table lookup returns.
    '''
    isbn = _metadata_from_span(soup, 'isbn')
    if not isbn:
        isbn = _metadata_from_table(soup, 'isbn')
    return isbn
|
||||
|
||||
def _get_comments(soup):
    '''
    Build a short "Published <date>, <n> pages." comment from the home page.

    :return: the comment as a unicode string, or ``None`` when the
             publication date or a page count cannot be found.
    '''
    date = (_metadata_from_span(soup, 'cwdate')
            or _metadata_from_table(soup, 'pub date'))
    pages = (_metadata_from_span(soup, 'pages')
             or _metadata_from_table(soup, 'pages'))
    # Explicit checks replace the former blanket ``except: pass``, which hid
    # every error (not just "value missing") behind a silent None.
    if not date or not pages:
        return None
    # date span can have copyright symbols in it...
    date = date.replace(u'\u00a9', u'').strip()
    if not date:
        return None
    # and pages often comes as '(\d+ pages)'
    match = re.search(r'\d+', pages)
    if match is None:
        return None
    return u'Published %s, %s pages.' % (date, match.group(0))
|
||||
|
||||
def _get_cover(soup, rdr):
|
||||
ans = None
|
||||
try:
|
||||
ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
|
||||
except TypeError:
|
||||
# meeehh, no handy alt-tag goodness, try some hackery
|
||||
# the basic idea behind this is that in general, the cover image
|
||||
# has a height:width ratio of ~1.25, whereas most of the nav
|
||||
# buttons are decidedly less than that.
|
||||
# what we do in this is work out that ratio, take 1.25 off it and
|
||||
# save the absolute value when we sort by this value, the smallest
|
||||
# one is most likely to be the cover image, hopefully.
|
||||
r = {}
|
||||
for img in soup('img'):
|
||||
try:
|
||||
r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
|
||||
except KeyError:
|
||||
# interestingly, occasionally the only image without height
|
||||
# or width attrs is the cover...
|
||||
r[0] = img['src']
|
||||
l = r.keys()
|
||||
l.sort()
|
||||
ans = r[l[0]]
|
||||
# this link comes from the internal html, which is in a subdir
|
||||
if ans is not None:
|
||||
try:
|
||||
ans = rdr.GetFile(ans)
|
||||
except:
|
||||
ans = rdr.root + "/" + ans
|
||||
try:
|
||||
ans = rdr.GetFile(ans)
|
||||
except:
|
||||
ans = None
|
||||
if ans is not None:
|
||||
from PIL import Image
|
||||
from cStringIO import StringIO
|
||||
buf = StringIO()
|
||||
try:
|
||||
Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG')
|
||||
ans = buf.getvalue()
|
||||
except:
|
||||
ans = None
|
||||
return ans
|
||||
|
||||
|
||||
def get_metadata_from_reader(rdr):
    '''
    Build a MetaInformation object from an already opened reader.

    The title is taken from ``rdr.title``. Authors, publisher, ISBN,
    comments and cover are scraped from the page returned by
    ``rdr.GetFile(rdr.home)``. Optional fields are set only when a value
    was found.
    '''
    raw = rdr.GetFile(rdr.home)
    markup = xml_to_unicode(raw, strip_encoding_pats=True,
                            resolve_entities=True)[0]
    home = BeautifulSoup(markup)

    mi = MetaInformation(rdr.title, _get_authors(home))

    # Simple text fields: assign only the ones that produced a value.
    optional_fields = (
        ('publisher', _get_publisher),
        ('isbn', _get_isbn),
        ('comments', _get_comments),
    )
    for field, extract in optional_fields:
        value = extract(home)
        if value:
            setattr(mi, field, value)

    cover = _get_cover(home, rdr)
    if cover is not None:
        mi.cover_data = ('jpg', cover)

    return mi
|
||||
|
||||
def get_metadata(stream):
    '''
    Read metadata from a CHM file supplied as a readable stream.

    The stream is copied in full to a temporary file, because CHMReader is
    constructed from a file name, and that file is handed to
    get_metadata_from_reader().
    '''
    with TemporaryFile('_chm_metadata.chm') as chm_path:
        with open(chm_path, 'wb') as out:
            out.write(stream.read())
        # Imported here, as in the original, rather than at module level.
        from calibre.ebooks.chm.reader import CHMReader
        return get_metadata_from_reader(CHMReader(chm_path, default_log))
|
@ -135,8 +135,13 @@ class CHMReader(CHMFile):
|
||||
if guess_mimetype(path)[0] == ('text/html'):
|
||||
data = self._reformat(data)
|
||||
f.write(data)
|
||||
#subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
|
||||
self._extracted = True
|
||||
files = os.listdir(output_dir)
|
||||
if self.hhc_path not in files:
|
||||
for f in files:
|
||||
if f.lower() == self.hhc_path.lower():
|
||||
self.hhc_path = f
|
||||
break
|
||||
|
||||
def _reformat(self, data):
|
||||
try:
|
||||
|
@ -935,11 +935,11 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
|
||||
return ids
|
||||
|
||||
def get_formats(id):
|
||||
book_data = db.get_data_as_dict(ids=[id])[0]
|
||||
formats = book_data['formats']
|
||||
formats = db.formats(id, index_is_id=True)
|
||||
fmts = []
|
||||
for format in formats:
|
||||
fmts.append(format.rpartition('.')[2])
|
||||
if formats:
|
||||
for format in formats.split(','):
|
||||
fmts.append(format.lower())
|
||||
return fmts
|
||||
|
||||
def generate_annotation_paths(ids, db, device):
|
||||
@ -1031,9 +1031,9 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
|
||||
if user_notes[location]['text']:
|
||||
annotations.append('<b>Location %d • %s</b><br />%s<br />' % \
|
||||
(user_notes[location]['displayed_location'],
|
||||
user_notes[location]['type'],
|
||||
user_notes[location]['text'] if \
|
||||
user_notes[location]['type'] == 'Note' else \
|
||||
user_notes[location]['type'],
|
||||
user_notes[location]['text'] if \
|
||||
user_notes[location]['type'] == 'Note' else \
|
||||
'<i>%s</i>' % user_notes[location]['text']))
|
||||
else:
|
||||
annotations.append('<b>Location %d • %s</b><br />' % \
|
||||
|
@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
|
||||
|app| supports the conversion of many input formats to many output formats.
|
||||
It can convert every input format in the following list, to every output format.
|
||||
|
||||
*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
|
||||
*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
|
||||
|
||||
*Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT
|
||||
|
||||
@ -191,7 +191,7 @@ Library Management
|
||||
|
||||
What formats does |app| read metadata from?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
|
||||
|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
|
||||
|
||||
Where are the book files stored?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -103,7 +103,7 @@ class RecipeInput(InputFormatPlugin):
|
||||
ro.download()
|
||||
self.recipe_object = ro
|
||||
|
||||
for key, val in recipe.conversion_options.items():
|
||||
for key, val in self.recipe_object.conversion_options.items():
|
||||
setattr(opts, key, val)
|
||||
|
||||
for f in os.listdir('.'):
|
||||
|
@ -623,7 +623,7 @@ class BasicNewsRecipe(Recipe):
|
||||
def download(self):
|
||||
'''
|
||||
Download and pre-process all articles from the feeds in this recipe.
|
||||
This method should be called only one on a particular Recipe instance.
|
||||
This method should be called only once on a particular Recipe instance.
|
||||
Calling it more than once will lead to undefined behavior.
|
||||
@return: Path to index.html
|
||||
@rtype: string
|
||||
@ -1358,3 +1358,26 @@ class AutomaticNewsRecipe(BasicNewsRecipe):
|
||||
if self.use_embedded_content:
|
||||
self.web2disk_options.keep_only_tags = []
|
||||
return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
|
||||
|
||||
class DownloadedNewsRecipe(BasicNewsRecipe):
    '''
    Recipe whose content has already been downloaded and packed into a zip
    archive. Subclasses say where the archive is by implementing
    :meth:`get_downloaded_recipe`.
    '''

    def get_downloaded_recipe(self):
        'Return path on local filesystem to downloaded recipe'
        raise NotImplementedError

    def download(self):
        '''
        Unpack the downloaded archive and adopt the conversion options of
        the recipe stored inside it.

        Failure to load the embedded ``*.downloaded_recipe`` file is logged
        and otherwise ignored.

        @return: Absolute path to index.html (resolved against the current
                 working directory)
        @rtype: string
        '''
        self.log('Fetching downloaded recipe')
        rpath = self.get_downloaded_recipe()
        from calibre.utils.zipfile import ZipFile
        zf = ZipFile(rpath)
        try:
            # no target given; presumably extracts into the current
            # working directory -- TODO confirm for calibre's ZipFile
            zf.extractall()
        finally:
            # always release the archive, even when extraction fails
            zf.close()
        from calibre.web.feeds.recipes import compile_recipe
        from glob import glob
        try:
            f = open(glob('*.downloaded_recipe')[0], 'rb')
            try:
                src = f.read()
            finally:
                # close explicitly rather than leaking the handle
                f.close()
            recipe = compile_recipe(src)
            self.conversion_options = recipe.conversion_options
        except Exception:
            # best effort: the content is usable without the options
            self.log.exception('Failed to compile downloaded recipe')
        return os.path.abspath('index.html')
|
||||
|
Loading…
x
Reference in New Issue
Block a user