From bac7e6b78c38b86a58cab81de20d06f5010eda1f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Mar 2010 22:10:36 -0700 Subject: [PATCH 1/3] ... --- src/calibre/web/feeds/input.py | 2 +- src/calibre/web/feeds/news.py | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py index 51aaeb3e4b..f3a7d01917 100644 --- a/src/calibre/web/feeds/input.py +++ b/src/calibre/web/feeds/input.py @@ -103,7 +103,7 @@ class RecipeInput(InputFormatPlugin): ro.download() self.recipe_object = ro - for key, val in recipe.conversion_options.items(): + for key, val in self.recipe_object.conversion_options.items(): setattr(opts, key, val) for f in os.listdir('.'): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index d0c9d941e3..d07c135abd 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -623,7 +623,7 @@ class BasicNewsRecipe(Recipe): def download(self): ''' Download and pre-process all articles from the feeds in this recipe. - This method should be called only one on a particular Recipe instance. + This method should be called only once on a particular Recipe instance. Calling it more than once will lead to undefined behavior. @return: Path to index.html @rtype: string @@ -1358,3 +1358,26 @@ class AutomaticNewsRecipe(BasicNewsRecipe): if self.use_embedded_content: self.web2disk_options.keep_only_tags = [] return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds) + +class DownloadedNewsRecipe(BasicNewsRecipe): + + def get_downloaded_recipe(self): + 'Return path on local filesystem to downloaded recipe' + raise NotImplementedError + + def download(self): + self.log('Fetching downloaded recipe') + rpath = self.get_downloaded_recipe() + from calibre.utils.zipfile import ZipFile + zf = ZipFile(rpath) + zf.extractall() + zf.close() + from calibre.web.feeds.recipes import compile_recipe + from glob import glob + try: + recipe = compile_recipe(open(glob('*.downloaded_recipe')[0], + 'rb').read()) + self.conversion_options = recipe.conversion_options + except: + self.log.exception('Failed to compile downloaded recipe') + return os.path.abspath('index.html') From c7e8c889a4c00bfcebb95d44ffff54bb32abdec8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 5 Mar 2010 09:49:04 -0700 Subject: [PATCH 2/3] Fix Fudzilla --- resources/recipes/fudzilla.recipe | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/resources/recipes/fudzilla.recipe b/resources/recipes/fudzilla.recipe index e7f3d99fe9..821488ad0a 100644 --- a/resources/recipes/fudzilla.recipe +++ b/resources/recipes/fudzilla.recipe @@ -1,27 +1,41 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2010 Starson17' ''' fudzilla.com ''' +import re from calibre.web.feeds.news import BasicNewsRecipe class Fudzilla(BasicNewsRecipe): title = u'Fudzilla' - __author__ = 'Darko Miletic' + __author__ = 'Starson17' language = 'en' description = 'Tech news' oldest_article = 7 + remove_javascript = True max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - feeds = [ (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')] - def print_version(self, url): - nurl = url.replace('http://www.fudzilla.com/index.php','http://www.fudzilla.com/index2.php') - nmain, nsep, nrest = nurl.partition('&Itemid=') - return nmain + '&pop=1&page=0&Itemid=1' + remove_tags_before = dict(name='div', attrs={'class':['padding']}) + + remove_tags = [dict(name='td', attrs={'class':['left','right']}), + dict(name='div', attrs={'id':['toolbar','buttons']}), + dict(name='div', attrs={'class':['artbannersxtd','back_button']}), + dict(name='span', attrs={'class':['pathway']}), + dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), + dict(name='table', attrs={'class':['headlines']}), + ] + + feeds = [ + (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1') + ] + + preprocess_regexps = [ + (re.compile(r'

Welcome.*

', re.DOTALL|re.IGNORECASE), lambda match: '') + ] From eae90e2ef409e1a8fffa9bd904beb9764edffc85 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 5 Mar 2010 10:49:28 -0700 Subject: [PATCH 3/3] Implement #4825 (CHM format) --- src/calibre/customize/builtins.py | 2 +- src/calibre/ebooks/chm/input.py | 9 +- src/calibre/ebooks/chm/metadata.py | 157 +++++++++++++++++++++++++++++ src/calibre/ebooks/chm/reader.py | 7 +- src/calibre/manual/faq.rst | 4 +- 5 files changed, 172 insertions(+), 7 deletions(-) create mode 100644 src/calibre/ebooks/chm/metadata.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 0ba197fac3..391b7d22e6 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin): description = _('Read metadata from %s files') % 'CHM' def get_metadata(self, stream, ftype): - from calibre.ebooks.metadata.chm import get_metadata + from calibre.ebooks.chm.metadata import get_metadata return get_metadata(stream) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index e2a270f2b8..3f0aa21f08 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin): rdr = CHMReader(chm_path, log) log.debug('Extracting CHM to %s' % output_dir) rdr.extract_content(output_dir) + self._chm_reader = rdr return rdr.hhc_path def convert(self, stream, options, file_ext, log, accelerators): - from calibre.ebooks.metadata.chm import get_metadata_ + from calibre.ebooks.chm.metadata import get_metadata_from_reader from calibre.customize.ui import plugin_for_input_format log.debug('Processing CHM...') - with TemporaryDirectory('chm2oeb') as tdir: + with TemporaryDirectory('_chm2oeb') as tdir: html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(options, opt.option.name, opt.recommended_value) @@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin): log.debug('stream.name=%s' % stream.name) mainname = self._chmtohtml(tdir, chm_name, no_images, log) mainpath = os.path.join(tdir, mainname) + #raw_input() - metadata = get_metadata_(tdir) + metadata = get_metadata_from_reader(self._chm_reader) odi = options.debug_pipeline options.debug_pipeline = None @@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin): if isinstance(node.tag, basestring): from calibre.ebooks.chm.reader import match_string + chapter_path = None if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'): for child in node: if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'): diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py new file mode 100644 index 0000000000..7386d54658 --- /dev/null +++ b/src/calibre/ebooks/chm/metadata.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import re + +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import string_to_authors, MetaInformation +from calibre.utils.logging import default_log +from calibre.ptempfile import TemporaryFile + +def _clean(s): + return s.replace(u'\u00a0', u' ') + +def _detag(tag): + str = u"" + for elem in tag: + if hasattr(elem, "contents"): + str += _detag(elem) + else: + str += _clean(elem) + return str + + +def _metadata_from_table(soup, searchfor): + td = soup.find('td', text=re.compile(searchfor, flags=re.I)) + if td is None: + return None + td = td.parent + # there appears to be multiple ways of structuring the metadata + # on the home page. cue some nasty special-case hacks... + if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I): + meta = _detag(td.findNextSibling('td')) + return re.sub('^:', '', meta).strip() + else: + meta = _detag(td) + return re.sub(r'^[^:]+:', '', meta).strip() + +def _metadata_from_span(soup, searchfor): + span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)}) + if span is None: + return None + # this metadata might need some cleaning up still :/ + return _detag(span.renderContents().strip()) + +def _get_authors(soup): + aut = (_metadata_from_span(soup, r'author') + or _metadata_from_table(soup, r'^\s*by\s*:?\s+')) + ans = [_('Unknown')] + if aut is not None: + ans = string_to_authors(aut) + return ans + +def _get_publisher(soup): + return (_metadata_from_span(soup, 'imprint') + or _metadata_from_table(soup, 'publisher')) + +def _get_isbn(soup): + return (_metadata_from_span(soup, 'isbn') + or _metadata_from_table(soup, 'isbn')) + +def _get_comments(soup): + date = (_metadata_from_span(soup, 'cwdate') + or _metadata_from_table(soup, 'pub date')) + pages = ( _metadata_from_span(soup, 'pages') + or _metadata_from_table(soup, 'pages')) + try: + # date span can have copyright symbols in it... + date = date.replace(u'\u00a9', '').strip() + # and pages often comes as '(\d+ pages)' + pages = re.search(r'\d+', pages).group(0) + return u'Published %s, %s pages.' % (date, pages) + except: + pass + return None + +def _get_cover(soup, rdr): + ans = None + try: + ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src'] + except TypeError: + # meeehh, no handy alt-tag goodness, try some hackery + # the basic idea behind this is that in general, the cover image + # has a height:width ratio of ~1.25, whereas most of the nav + # buttons are decidedly less than that. + # what we do in this is work out that ratio, take 1.25 off it and + # save the absolute value when we sort by this value, the smallest + # one is most likely to be the cover image, hopefully. + r = {} + for img in soup('img'): + try: + r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src'] + except KeyError: + # interestingly, occasionally the only image without height + # or width attrs is the cover... + r[0] = img['src'] + l = r.keys() + l.sort() + ans = r[l[0]] + # this link comes from the internal html, which is in a subdir + if ans is not None: + try: + ans = rdr.GetFile(ans) + except: + ans = rdr.root + "/" + ans + try: + ans = rdr.GetFile(ans) + except: + ans = None + if ans is not None: + from PIL import Image + from cStringIO import StringIO + buf = StringIO() + try: + Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG') + ans = buf.getvalue() + except: + ans = None + return ans + + +def get_metadata_from_reader(rdr): + raw = rdr.GetFile(rdr.home) + home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0]) + + title = rdr.title + authors = _get_authors(home) + mi = MetaInformation(title, authors) + publisher = _get_publisher(home) + if publisher: + mi.publisher = publisher + isbn = _get_isbn(home) + if isbn: + mi.isbn = isbn + comments = _get_comments(home) + if comments: + mi.comments = comments + + cdata = _get_cover(home, rdr) + if cdata is not None: + mi.cover_data = ('jpg', cdata) + + return mi + +def get_metadata(stream): + with TemporaryFile('_chm_metadata.chm') as fname: + with open(fname, 'wb') as f: + f.write(stream.read()) + from calibre.ebooks.chm.reader import CHMReader + rdr = CHMReader(fname, default_log) + return get_metadata_from_reader(rdr) diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 33272e9695..412ca94d8a 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -135,8 +135,13 @@ class CHMReader(CHMFile): if guess_mimetype(path)[0] == ('text/html'): data = self._reformat(data) f.write(data) - #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir]) self._extracted = True + files = os.listdir(output_dir) + if self.hhc_path not in files: + for f in files: + if f.lower() == self.hhc_path.lower(): + self.hhc_path = f + break def _reformat(self, data): try: diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 043c8d7041..eff65fdb7b 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -20,7 +20,7 @@ What formats does |app| support conversion to/from? |app| supports the conversion of many input formats to many output formats. It can convert every input format in the following list, to every output format. -*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT +*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT @@ -191,7 +191,7 @@ Library Management What formats does |app| read metadata from? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI +|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI Where are the book files stored? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~