KG pre-6.44 release

2025-08-30 23:00:21 -04:00 · 2010-03-05 11:17:02 -08:00 · 2010-03-05 11:17:02 -08:00 · b4481eaf76
commit b4481eaf76
parent b8792909b1 fd8eef3e39
11 changed files with 227 additions and 27 deletions
--- a/resources/recipes/fudzilla.recipe
+++ b/resources/recipes/fudzilla.recipe
@ -1,27 +1,41 @@
 #!/usr/bin/env  python

 __license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010 Starson17'
 '''
 fudzilla.com
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class Fudzilla(BasicNewsRecipe):
+    # Recipe for fudzilla.com. Articles come from the single RSS feed in
+    # ``feeds``; the remaining attributes configure how each page is cleaned.
    title                 = u'Fudzilla'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Starson17'
    language = 'en'

    description           = 'Tech news'
    oldest_article        = 7
+    remove_javascript = True
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

-    feeds = [ (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')]

-    def print_version(self, url):
-        nurl = url.replace('http://www.fudzilla.com/index.php','http://www.fudzilla.com/index2.php')
-        nmain, nsep, nrest = nurl.partition('&Itemid=')
-        return  nmain + '&pop=1&page=0&Itemid=1'
+    # presumably everything ahead of the first <div class="padding"> is
+    # discarded by the base class -- confirm against BasicNewsRecipe
+    remove_tags_before = dict(name='div', attrs={'class':['padding']})
+
+    # tag specifications (element name + class/id values) to be removed
+    remove_tags = [dict(name='td', attrs={'class':['left','right']}),
+                   dict(name='div', attrs={'id':['toolbar','buttons']}), 
+                   dict(name='div', attrs={'class':['artbannersxtd','back_button']}), 
+                   dict(name='span', attrs={'class':['pathway']}), 
+                   dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), 
+                   dict(name='table', attrs={'class':['headlines']}), 
+                   ]
+
+    feeds = [
+             (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
+             ]
+
+    # (pattern, replacement function) pairs; the match is replaced by ''.
+    # NOTE(review): '.*' is greedy and re.DOTALL lets it cross newlines, so
+    # the match extends to the LAST '</p> ' in the input, not the first one
+    # after "Welcome" -- confirm this is intended ('.*?' would stop early).
+    preprocess_regexps = [
+        (re.compile(r'<p class="MsoNormal"> Welcome.*</p> ', re.DOTALL|re.IGNORECASE), lambda match: '')
+        ]
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
    description = _('Read metadata from %s files') % 'CHM'

    def get_metadata(self, stream, ftype):
+        # ftype is not used here; the imported get_metadata() receives only
+        # the stream. The import is done inside the method on every call.
-        from calibre.ebooks.metadata.chm import get_metadata
+        from calibre.ebooks.chm.metadata import get_metadata
        return get_metadata(stream)


--- a/src/calibre/devices/kindle/driver.py
+++ b/src/calibre/devices/kindle/driver.py
@ -12,7 +12,6 @@ from cStringIO import StringIO
 from struct import unpack

 from calibre.devices.usbms.driver import USBMS
-from calibre.utils.logging import Log

 class KINDLE(USBMS):

@ -116,7 +115,6 @@ class KINDLE(USBMS):
                path_map.pop(id)
            return path_map, book_ext

-        log = Log()
        storage = get_storage()
        path_map, book_ext = resolve_bookmark_paths(storage, path_map)

@ -358,4 +356,4 @@ class Bookmark():
                    self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length])

        else:
-            print "unsupported bookmark_extension: %s" % bookmark_extension
+            print "unsupported bookmark_extension: %s" % self.bookmark_extension
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
        rdr = CHMReader(chm_path, log)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir)
+        self._chm_reader = rdr
        return rdr.hhc_path


    def convert(self, stream, options, file_ext, log, accelerators):
-        from calibre.ebooks.metadata.chm import get_metadata_
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
-        with TemporaryDirectory('chm2oeb') as tdir:
+        with TemporaryDirectory('_chm2oeb') as tdir:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)
+            #raw_input()

-            metadata = get_metadata_(tdir)
+            metadata = get_metadata_from_reader(self._chm_reader)

            odi = options.debug_pipeline
            options.debug_pipeline = None
@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
        if isinstance(node.tag, basestring):
            from calibre.ebooks.chm.reader import match_string

+            chapter_path = None
            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                for child in node:
                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import string_to_authors, MetaInformation
+from calibre.utils.logging import default_log
+from calibre.ptempfile import TemporaryFile
+
+def _clean(s):
+    'Return s with every non-breaking space replaced by a plain space'
+    nbsp = u'\u00a0'
+    return s.replace(nbsp, u' ')
+
+def _detag(tag):
+    '''
+    Flatten `tag` into a single unicode string.
+
+    Each element obtained by iterating over `tag` is either descended into
+    (when it has a ``contents`` attribute) or passed through
+    :func:`_clean`; the resulting pieces are concatenated in order.
+    '''
+    # gather the pieces and join once; this also avoids naming a local
+    # ``str``, which used to shadow the builtin
+    parts = []
+    for elem in tag:
+        if hasattr(elem, "contents"):
+            parts.append(_detag(elem))
+        else:
+            parts.append(_clean(elem))
+    return u"".join(parts)
+
+
+def _metadata_from_table(soup, searchfor):
+    '''
+    Look up a metadata value that is laid out in table cells.
+
+    :param soup: BeautifulSoup tree to search
+    :param searchfor: regular expression source (matched ignoring case)
+                      that identifies the label text
+    :return: the stripped value, or None when no label matches or the
+             label has no following cell
+    '''
+    hit = soup.find('td', text=re.compile(searchfor, flags=re.I))
+    if hit is None:
+        return None
+    td = hit.parent
+    # there appear to be multiple ways of structuring the metadata
+    # on the home page. cue some nasty special-case hacks...
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
+        # this element holds nothing but the label, so the value is
+        # expected in the next <td>
+        value_td = td.findNextSibling('td')
+        if value_td is None:
+            # a label without a value cell: report "not found" rather
+            # than failing inside _detag(None)
+            return None
+        meta = _detag(value_td)
+        return re.sub('^:', '', meta).strip()
+    # label and value share one element: drop everything up to and
+    # including the first colon
+    meta = _detag(td)
+    return re.sub(r'^[^:]+:', '', meta).strip()
+
+def _metadata_from_span(soup, searchfor):
+    '''
+    Return the detagged contents of the first <span> whose ``class``
+    matches the regular expression `searchfor` (ignoring case), or None
+    when `soup` has no such span.
+    '''
+    class_pat = re.compile(searchfor, flags=re.I)
+    span = soup.find('span', {'class': class_pat})
+    if span is not None:
+        # this metadata might need some cleaning up still :/
+        rendered = span.renderContents().strip()
+        return _detag(rendered)
+    return None
+
+def _get_authors(soup):
+    '''
+    Return the authors found in `soup`, as produced by string_to_authors.
+
+    An "author" span is tried first, then a table cell starting with
+    "by". When neither lookup returns a value the result is a one-element
+    list holding the translated string 'Unknown'.
+    '''
+    aut = _metadata_from_span(soup, r'author')
+    if not aut:
+        aut = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
+    if aut is None:
+        return [_('Unknown')]
+    return string_to_authors(aut)
+
+def _get_publisher(soup):
+    'Return the publisher from an "imprint" span, else from a "publisher" table cell'
+    publisher = _metadata_from_span(soup, 'imprint')
+    if publisher:
+        return publisher
+    return _metadata_from_table(soup, 'publisher')
+
+def _get_isbn(soup):
+    'Return the ISBN from an "isbn" span, else from an "isbn" table cell'
+    isbn = _metadata_from_span(soup, 'isbn')
+    if isbn:
+        return isbn
+    return _metadata_from_table(soup, 'isbn')
+
+def _get_comments(soup):
+    '''
+    Build a "Published <date>, <n> pages." comment from the publication
+    date and page count found in `soup`.
+
+    :return: the comment as a unicode string, or None when either value
+             is missing or the page count contains no digits
+    '''
+    date = (_metadata_from_span(soup, 'cwdate')
+        or _metadata_from_table(soup, 'pub date'))
+    pages = (_metadata_from_span(soup, 'pages')
+        or _metadata_from_table(soup, 'pages'))
+    # check for the expected failure modes explicitly instead of hiding
+    # every possible error behind a bare except
+    if date is None or pages is None:
+        return None
+    # pages often comes as '(\d+ pages)'
+    count = re.search(r'\d+', pages)
+    if count is None:
+        return None
+    # date span can have copyright symbols in it...
+    date = date.replace(u'\u00a9', '').strip()
+    return u'Published %s, %s pages.' % (date, count.group(0))
+
+def _guess_cover_src(soup):
+    '''
+    Return the ``src`` of the <img> in `soup` that is most likely to be
+    the cover, or None when no image with a ``src`` can be found.
+    '''
+    tagged = soup.find('img', alt=re.compile('cover', flags=re.I))
+    if tagged is not None:
+        try:
+            return tagged['src']
+        except KeyError:
+            # alt text mentions a cover but there is nothing to load;
+            # fall through to the guesswork below
+            pass
+    # meeehh, no handy alt-tag goodness, try some hackery.
+    # the basic idea behind this is that in general, the cover image
+    # has a height:width ratio of ~1.25, whereas most of the nav
+    # buttons are decidedly less than that.
+    # so we work out that ratio, take 1.25 off it and save the absolute
+    # value. the image with the smallest value is most likely to be the
+    # cover image, hopefully.
+    r = {}
+    for img in soup('img'):
+        try:
+            src = img['src']
+        except KeyError:
+            # an image without a src cannot be the cover
+            continue
+        try:
+            r[abs(float(img['height'])/float(img['width'])-1.25)] = src
+        except KeyError:
+            # interestingly, occasionally the only image without height
+            # or width attrs is the cover...
+            r[0] = src
+        except (ValueError, ZeroDivisionError):
+            # sizes such as "100%" or a zero width say nothing about
+            # the aspect ratio, ignore the image
+            continue
+    if not r:
+        # page has no usable images at all
+        return None
+    return r[min(r)]
+
+
+def _get_cover(soup, rdr):
+    '''
+    Locate the cover image referenced by `soup` and return it converted
+    to JPEG.
+
+    :param soup: BeautifulSoup tree to inspect for <img> tags
+    :param rdr: reader object; ``rdr.GetFile(path)`` is used to fetch the
+                image data and ``rdr.root`` as a fallback path prefix
+    :return: JPEG data as a string, or None when no image is found, it
+             cannot be read, or it cannot be converted
+    '''
+    src = _guess_cover_src(soup)
+    if src is None:
+        return None
+    # this link comes from the internal html, which is in a subdir
+    try:
+        raw = rdr.GetFile(src)
+    except Exception:
+        rooted = rdr.root + "/" + src
+        try:
+            raw = rdr.GetFile(rooted)
+        except Exception:
+            return None
+    if raw is None:
+        return None
+    from PIL import Image
+    from cStringIO import StringIO
+    buf = StringIO()
+    try:
+        Image.open(StringIO(raw)).convert('RGB').save(buf, 'JPEG')
+    except Exception:
+        # unreadable or unsupported image: cover extraction is best
+        # effort, so report "no cover" rather than failing
+        return None
+    return buf.getvalue()
+
+
+def get_metadata_from_reader(rdr):
+    '''
+    Build a MetaInformation object from the reader `rdr`.
+
+    The title is taken from ``rdr.title``; authors, publisher, ISBN,
+    comments and cover are looked up in the page ``rdr.home``, which is
+    fetched through ``rdr.GetFile``. Fields for which nothing is found
+    are left unset.
+    '''
+    raw = rdr.GetFile(rdr.home)
+    markup = xml_to_unicode(raw, strip_encoding_pats=True,
+        resolve_entities=True)[0]
+    home = BeautifulSoup(markup)
+
+    mi = MetaInformation(rdr.title, _get_authors(home))
+    optional = (
+        ('publisher', _get_publisher),
+        ('isbn', _get_isbn),
+        ('comments', _get_comments),
+    )
+    for field, getter in optional:
+        value = getter(home)
+        if value:
+            setattr(mi, field, value)
+
+    cdata = _get_cover(home, rdr)
+    if cdata is not None:
+        mi.cover_data = ('jpg', cdata)
+
+    return mi
+
+def get_metadata(stream):
+    '''
+    Read metadata from the CHM data in `stream`.
+
+    The whole stream is copied into a temporary file, which is then
+    opened with CHMReader and handed to get_metadata_from_reader.
+    '''
+    with TemporaryFile('_chm_metadata.chm') as fname:
+        with open(fname, 'wb') as dest:
+            data = stream.read()
+            dest.write(data)
+        from calibre.ebooks.chm.reader import CHMReader
+        reader = CHMReader(fname, default_log)
+        return get_metadata_from_reader(reader)
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -135,8 +135,13 @@ class CHMReader(CHMFile):
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
-        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True
+        files = os.listdir(output_dir)
+        if self.hhc_path not in files:
+            for f in files:
+                if f.lower() == self.hhc_path.lower():
+                    self.hhc_path = f
+                    break

    def _reformat(self, data):
        try:
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@ -935,11 +935,11 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
 			return ids

 		def get_formats(id):
-			book_data = db.get_data_as_dict(ids=[id])[0]
-			formats = book_data['formats']
+			# Return the lower-cased format names of book `id` as a list.
+			# db.formats() is split on ',' below; the guard covers a falsy
+			# result (presumably a book without any formats).
+			# NOTE: indented with tabs like the rest of this file, mixing
+			# in space-indented lines does not line up with the tab levels.
+			formats = db.formats(id, index_is_id=True)
 			fmts = []
-			for format in formats:
-				fmts.append(format.rpartition('.')[2])
+			if formats:
+				for format in formats.split(','):
+					fmts.append(format.lower())
 			return fmts

 		def generate_annotation_paths(ids, db, device):
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.

-*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT

 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT

@ -191,7 +191,7 @@ Library Management

 What formats does |app| read metadata from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
+|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI

 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/calibre/web/feeds/input.py
+++ b/src/calibre/web/feeds/input.py
@ -103,7 +103,7 @@ class RecipeInput(InputFormatPlugin):
            ro.download()
            self.recipe_object = ro

-        for key, val in recipe.conversion_options.items():
+        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        for f in os.listdir('.'):
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -623,7 +623,7 @@ class BasicNewsRecipe(Recipe):
    def download(self):
        '''
        Download and pre-process all articles from the feeds in this recipe.
-        This method should be called only one on a particular Recipe instance.
+        This method should be called only once on a particular Recipe instance.
        Calling it more than once will lead to undefined behavior.
        @return: Path to index.html
        @rtype: string
@ -1358,3 +1358,26 @@ class AutomaticNewsRecipe(BasicNewsRecipe):
        if self.use_embedded_content:
            self.web2disk_options.keep_only_tags = []
        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
+
+class DownloadedNewsRecipe(BasicNewsRecipe):
+    '''
+    Recipe whose content is not fetched from feeds: :meth:`download`
+    unpacks a zip archive supplied by :meth:`get_downloaded_recipe`.
+    '''
+
+    def get_downloaded_recipe(self):
+        'Return path on local filesystem to downloaded recipe'
+        raise NotImplementedError
+
+    def download(self):
+        '''
+        Extract the downloaded recipe archive into the current directory
+        and take over the conversion options of the first
+        ``*.downloaded_recipe`` file found there. Failure to load that
+        file is logged and otherwise ignored.
+
+        @return: Absolute path of ``index.html`` in the current directory
+        @rtype: string
+        '''
+        self.log('Fetching downloaded recipe')
+        rpath = self.get_downloaded_recipe()
+        from calibre.utils.zipfile import ZipFile
+        zf = ZipFile(rpath)
+        try:
+            zf.extractall()
+        finally:
+            # release the archive even when extraction fails
+            zf.close()
+        from calibre.web.feeds.recipes import compile_recipe
+        from glob import glob
+        try:
+            # an archive without a recipe file ends up in the handler
+            # below via IndexError
+            src_file = open(glob('*.downloaded_recipe')[0], 'rb')
+            try:
+                src = src_file.read()
+            finally:
+                src_file.close()
+            recipe = compile_recipe(src)
+            self.conversion_options = recipe.conversion_options
+        except Exception:
+            # best effort: keep the default conversion options
+            self.log.exception('Failed to compile downloaded recipe')
+        return os.path.abspath('index.html')