From bac7e6b78c38b86a58cab81de20d06f5010eda1f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 4 Mar 2010 22:10:36 -0700
Subject: [PATCH 1/3] ...

---
 src/calibre/web/feeds/input.py |  2 +-
 src/calibre/web/feeds/news.py  | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py
index 51aaeb3e4b..f3a7d01917 100644
--- a/src/calibre/web/feeds/input.py
+++ b/src/calibre/web/feeds/input.py
@@ -103,7 +103,7 @@ class RecipeInput(InputFormatPlugin):
             ro.download()
             self.recipe_object = ro
 
-        for key, val in recipe.conversion_options.items():
+        for key, val in self.recipe_object.conversion_options.items():
             setattr(opts, key, val)
 
         for f in os.listdir('.'):
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index d0c9d941e3..d07c135abd 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -623,7 +623,7 @@ class BasicNewsRecipe(Recipe):
     def download(self):
         '''
         Download and pre-process all articles from the feeds in this recipe.
-        This method should be called only one on a particular Recipe instance.
+        This method should be called only once on a particular Recipe instance.
         Calling it more than once will lead to undefined behavior.
         @return: Path to index.html
         @rtype: string
@@ -1358,3 +1358,26 @@ class AutomaticNewsRecipe(BasicNewsRecipe):
         if self.use_embedded_content:
             self.web2disk_options.keep_only_tags = []
         return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)
+
+class DownloadedNewsRecipe(BasicNewsRecipe):
+    'Recipe whose download() unpacks a ZIP archive instead of fetching feeds.'
+    def get_downloaded_recipe(self):
+        'Return path on local filesystem to downloaded recipe'
+        raise NotImplementedError
+
+    def download(self):
+        from glob import glob
+        from calibre.utils.zipfile import ZipFile
+        from calibre.web.feeds.recipes import compile_recipe
+        self.log('Fetching downloaded recipe')
+        zf = ZipFile(self.get_downloaded_recipe())
+        try:
+            zf.extractall()
+        finally:
+            zf.close()
+        try:
+            with open(glob('*.downloaded_recipe')[0], 'rb') as f:
+                self.conversion_options = compile_recipe(f.read()).conversion_options
+        except Exception:
+            self.log.exception('Failed to compile downloaded recipe')
+        return os.path.abspath('index.html')

From c7e8c889a4c00bfcebb95d44ffff54bb32abdec8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 5 Mar 2010 09:49:04 -0700
Subject: [PATCH 2/3] Fix Fudzilla

---
 resources/recipes/fudzilla.recipe | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/resources/recipes/fudzilla.recipe b/resources/recipes/fudzilla.recipe
index e7f3d99fe9..821488ad0a 100644
--- a/resources/recipes/fudzilla.recipe
+++ b/resources/recipes/fudzilla.recipe
@@ -1,27 +1,41 @@
 #!/usr/bin/env  python
 
 __license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010 Starson17'
 '''
 fudzilla.com
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Fudzilla(BasicNewsRecipe):
     title                 = u'Fudzilla'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Starson17'
     language = 'en'
 
     description           = 'Tech news'
     oldest_article        = 7
+    remove_javascript = True
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
 
-    feeds = [ (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')]
 
-    def print_version(self, url):
-        nurl = url.replace('http://www.fudzilla.com/index.php','http://www.fudzilla.com/index2.php')
-        nmain, nsep, nrest = nurl.partition('&Itemid=')
-        return  nmain + '&pop=1&page=0&Itemid=1'
+    remove_tags_before = dict(name='div', attrs={'class':['padding']})
+
+    remove_tags = [dict(name='td', attrs={'class':['left','right']}),
+                   dict(name='div', attrs={'id':['toolbar','buttons']}),
+                   dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
+                   dict(name='span', attrs={'class':['pathway']}),
+                   dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
+                   dict(name='table', attrs={'class':['headlines']}),
+                   ]
+
+    feeds = [
+             (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
+             ]
+    # Non-greedy on purpose: with DOTALL a greedy .* would run to the last '</p> ' of the page
+    preprocess_regexps = [
+        (re.compile(r'<p class="MsoNormal"> Welcome.*?</p> ', re.DOTALL|re.IGNORECASE), lambda match: '')
+        ]

From eae90e2ef409e1a8fffa9bd904beb9764edffc85 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 5 Mar 2010 10:49:28 -0700
Subject: [PATCH 3/3] Implement #4825 (CHM format)

---
 src/calibre/customize/builtins.py  |   2 +-
 src/calibre/ebooks/chm/input.py    |   9 +-
 src/calibre/ebooks/chm/metadata.py | 157 +++++++++++++++++++++++++++++
 src/calibre/ebooks/chm/reader.py   |   7 +-
 src/calibre/manual/faq.rst         |   4 +-
 5 files changed, 172 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/chm/metadata.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 0ba197fac3..391b7d22e6 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -110,7 +110,7 @@ class CHMMetadataReader(MetadataReaderPlugin):
     description = _('Read metadata from %s files') % 'CHM'
 
     def get_metadata(self, stream, ftype):
-        from calibre.ebooks.metadata.chm import get_metadata
+        from calibre.ebooks.chm.metadata import get_metadata
         return get_metadata(stream)
 
 
diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index e2a270f2b8..3f0aa21f08 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -25,15 +25,16 @@ class CHMInput(InputFormatPlugin):
         rdr = CHMReader(chm_path, log)
         log.debug('Extracting CHM to %s' % output_dir)
         rdr.extract_content(output_dir)
+        self._chm_reader = rdr
         return rdr.hhc_path
 
 
     def convert(self, stream, options, file_ext, log, accelerators):
-        from calibre.ebooks.metadata.chm import get_metadata_
+        from calibre.ebooks.chm.metadata import get_metadata_from_reader
         from calibre.customize.ui import plugin_for_input_format
 
         log.debug('Processing CHM...')
-        with TemporaryDirectory('chm2oeb') as tdir:
+        with TemporaryDirectory('_chm2oeb') as tdir:
             html_input = plugin_for_input_format('html')
             for opt in html_input.options:
                 setattr(options, opt.option.name, opt.recommended_value)
@@ -48,8 +49,9 @@ class CHMInput(InputFormatPlugin):
             log.debug('stream.name=%s' % stream.name)
             mainname = self._chmtohtml(tdir, chm_name, no_images, log)
             mainpath = os.path.join(tdir, mainname)
+            #raw_input()
 
-            metadata = get_metadata_(tdir)
+            metadata = get_metadata_from_reader(self._chm_reader)
 
             odi = options.debug_pipeline
             options.debug_pipeline = None
@@ -170,6 +172,7 @@ class CHMInput(InputFormatPlugin):
         if isinstance(node.tag, basestring):
             from calibre.ebooks.chm.reader import match_string
 
+            chapter_path = None
             if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                 for child in node:
                     if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
new file mode 100644
index 0000000000..7386d54658
--- /dev/null
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import string_to_authors, MetaInformation
+from calibre.utils.logging import default_log
+from calibre.ptempfile import TemporaryFile
+
+def _clean(s):  # replace every U+00A0 (no-break space) with a plain space
+    return s.replace(u'\u00a0', u' ')
+
+def _detag(tag):
+    'Recursively flatten ``tag`` into text, passing each leaf through _clean.'
+    parts = []
+    for elem in tag:
+        # Anything exposing ``contents`` is treated as a nested tag
+        nested = hasattr(elem, "contents")
+        parts.append(_detag(elem) if nested else _clean(elem))
+    return u"".join(parts)
+
+
+def _metadata_from_table(soup, searchfor):
+    'Return the table-cell value labelled by regex ``searchfor``, or None.'
+    label = soup.find('td', text=re.compile(searchfor, flags=re.I))
+    if label is None:
+        return None
+    td = label.parent
+    # Layouts vary: the label may fill its own cell, with the value in the next
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
+        value = td.findNextSibling('td')
+        if value is None:  # label cell with no value cell after it
+            return None
+        return re.sub('^:', '', _detag(value)).strip()
+    return re.sub(r'^[^:]+:', '', _detag(td)).strip()  # "label: value" cell
+
+def _metadata_from_span(soup, searchfor):
+    'Return contents of the first <span> whose class matches searchfor, or None.'
+    pat = re.compile(searchfor, flags=re.I)
+    span = soup.find('span', {'class': pat})
+    # NOTE: the rendered contents might still need some cleaning up
+    return None if span is None else _detag(span.renderContents().strip())
+
+def _get_authors(soup):
+    'Return the list of author names found in ``soup``, or [_("Unknown")].'
+    aut = (_metadata_from_span(soup, r'author')
+        or _metadata_from_table(soup, r'^\s*by\s*:?\s+'))
+    # An empty match must not replace the placeholder with an empty list
+    authors = string_to_authors(aut) if aut else []
+    return authors or [_('Unknown')]
+
+def _get_publisher(soup):
+    'Return the publisher from an "imprint" span, else a "publisher" table cell.'
+    return _metadata_from_span(soup, 'imprint') or _metadata_from_table(soup, 'publisher')
+
+def _get_isbn(soup):
+    'Return the ISBN from an "isbn" span, else from an "isbn" table cell.'
+    return _metadata_from_span(soup, 'isbn') or _metadata_from_table(soup, 'isbn')
+
+def _get_comments(soup):
+    '''Build a "Published <date>, <n> pages." comment from ``soup``.
+    Returns None when the date or the page count cannot be found.'''
+    date = (_metadata_from_span(soup, 'cwdate')
+        or _metadata_from_table(soup, 'pub date'))
+    pages = (_metadata_from_span(soup, 'pages')
+        or _metadata_from_table(soup, 'pages'))
+    if not date or not pages:
+        return None
+    count = re.search(r'\d+', pages)  # often comes as '(\d+ pages)'
+    if count is None:
+        return None
+    date = date.replace(u'\u00a9', '').strip()  # may hold copyright symbols
+    return u'Published %s, %s pages.' % (date, count.group(0))
+
+def _get_cover(soup, rdr):
+    '''Return the cover image as JPEG data, or None if none can be found.
+    ``soup`` is the parsed home page; ``rdr`` supplies ``GetFile(path)``
+    and ``root``, which are used to read the image file.'''
+    img = soup.find('img', alt=re.compile('cover', flags=re.I))
+    try:
+        ans = img['src']
+    except (TypeError, KeyError):
+        # No usable alt-tagged image, so guess: a cover usually has a
+        # height:width ratio of ~1.25 while nav buttons are well below that.
+        # Pick the image whose ratio is closest to 1.25.
+        r = {}
+        for img in soup('img'):
+            try:
+                src = img['src']
+            except KeyError:
+                continue
+            try:
+                r[abs(float(img['height'])/float(img['width'])-1.25)] = src
+            except KeyError:
+                # occasionally the only image lacking height/width is the cover
+                r[0] = src
+            except (ValueError, ZeroDivisionError):
+                pass  # dimensions such as "100%" or "0" cannot be ranked
+        ans = r[min(r)] if r else None
+    if ans is None:
+        return None
+    # this link comes from the internal html, which is in a subdir
+    try:
+        ans = rdr.GetFile(ans)
+    except Exception:
+        try:
+            ans = rdr.GetFile(rdr.root + "/" + ans)
+        except Exception:
+            return None
+    from PIL import Image
+    from cStringIO import StringIO
+    buf = StringIO()
+    try:
+        Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG')
+    except Exception:
+        return None
+    return buf.getvalue()
+
+
+def get_metadata_from_reader(rdr):
+    '''
+    Build a MetaInformation object from the home page exposed by ``rdr``.
+
+    The title is taken from ``rdr.title``; authors, publisher, ISBN, comments
+    and cover are scraped from the HTML of ``rdr.home``.
+    '''
+    markup = xml_to_unicode(rdr.GetFile(rdr.home), strip_encoding_pats=True,
+        resolve_entities=True)[0]
+    home = BeautifulSoup(markup)
+
+    mi = MetaInformation(rdr.title, _get_authors(home))
+    # Optional fields are only set when the home page actually yielded a value
+    for field, getter in (('publisher', _get_publisher), ('isbn', _get_isbn),
+            ('comments', _get_comments)):
+        value = getter(home)
+        if value:
+            setattr(mi, field, value)
+
+    cover = _get_cover(home, rdr)
+    if cover is not None:
+        mi.cover_data = ('jpg', cover)
+    return mi
+
+def get_metadata(stream):
+    'Read metadata from the CHM data in ``stream`` via a temporary copy on disk.'
+    with TemporaryFile('_chm_metadata.chm') as fname:
+        with open(fname, 'wb') as dest:
+            dest.write(stream.read())
+        from calibre.ebooks.chm.reader import CHMReader
+        return get_metadata_from_reader(CHMReader(fname, default_log))
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 33272e9695..412ca94d8a 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -135,8 +135,13 @@ class CHMReader(CHMFile):
                 if guess_mimetype(path)[0] == ('text/html'):
                     data = self._reformat(data)
                 f.write(data)
-        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
         self._extracted = True
+        files = os.listdir(output_dir)
+        if self.hhc_path not in files:
+            for f in files:
+                if f.lower() == self.hhc_path.lower():
+                    self.hhc_path = f
+                    break
 
     def _reformat(self, data):
         try:
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 043c8d7041..eff65fdb7b 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -20,7 +20,7 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.
 
-*Input Formats:* CBZ, CBR, CBC, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, TCR, TXT
 
 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, PDB, PML, RB, PDF, TCR, TXT
 
@@ -191,7 +191,7 @@ Library Management
 
 What formats does |app| read metadata from?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| reads metadata from the following formats: LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
+|app| reads metadata from the following formats: CHM, LRF, PDF, LIT, RTF, OPF, MOBI, PRC, EPUB, FB2, IMP, RB, HTML. In addition it can write metadata to: LRF, RTF, OPF, EPUB, PDF, MOBI
 
 Where are the book files stored?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~