GwR utf-8 fix for Topaz metadata

2025-07-09 03:04:10 -04:00 · 2010-03-09 14:00:01 -07:00 · 2010-03-09 14:00:01 -07:00 · 4fce3578ff
commit 4fce3578ff
parent 95f504e096 4e943c0224
13 changed files with 153 additions and 20 deletions
--- a/resources/images/news/elsevier.png
+++ b/resources/images/news/elsevier.png
--- a/resources/recipes/elsevier.recipe
+++ b/resources/recipes/elsevier.recipe
@ -0,0 +1,88 @
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+elsevier.nl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Pagina12(BasicNewsRecipe):
+    '''
+    News-download recipe for elsevier.nl, fed by the site's RSS feeds
+    (language 'nl', country 'NL').
+
+    NOTE(review): the class name looks carried over from another recipe;
+    it is kept unchanged so the public name stays stable.
+    '''
+    title                 = 'Elsevier.nl'
+    __author__            = 'Darko Miletic'
+    description           = 'News from the Netherlands'
+    publisher             = 'elsevier.nl'
+    category              = 'news, politics, Netherlands'
+    oldest_article        = 2
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf-8'
+    use_embedded_content  = False
+    language              = 'nl'
+    country               = 'NL'
+    remove_empty_feeds    = True
+    masthead_url          = 'http://www.elsevier.nl/static/elsevier/stdimg/logo.gif'
+    extra_css             = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em}  '
+
+    # Values reuse the class attributes above so the two cannot drift apart.
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    # Tag selectors used to cut the page down to the article itself.
+    # keep_only_tags is written as a list, the same shape as remove_tags.
+    keep_only_tags     = [dict(attrs={'id':'artikel_container'})]
+    remove_tags_before = dict(attrs={'id':'breadcrumb_container'})
+    remove_tags_after  = dict(attrs={'class':'author_link'})
+    remove_tags        = [
+                          dict(attrs={'id':'breadcrumb_container'})
+                         ,dict(name='div',attrs={'class':'pullout_vak'})
+                         ]
+    remove_attributes  = ['width','height']
+
+    feeds = [
+               (u'Laatste nieuws'      , u'http://www.elsevier.nl/web/RSS/Homepage-RSS.htm?output=xml'         )
+              ,(u'Nederland'           , u'http://www.elsevier.nl/web/RSS/Nederland-RSS.htm?output=xml'        )
+              ,(u'Politiek'            , u'http://www.elsevier.nl/web/RSS/Politiek-RSS.htm?output=xml'         )
+              ,(u'Europese Unie'       , u'http://www.elsevier.nl/web/RSS/Europese-Unie-RSS.htm?output=xml'    )
+              ,(u'Buitenland'          , u'http://www.elsevier.nl/web/RSS/Buitenland-RSS.htm?output=xml'       )
+              ,(u'Economie'            , u'http://www.elsevier.nl/web/RSS/Economie-RSS.htm?output=xml'         )
+              ,(u'Wetenschap'          , u'http://www.elsevier.nl/web/RSS/Wetenschap-RSS.htm?output=xml'       )
+              ,(u'Cultuur & Televisie' , u'http://www.elsevier.nl/web/RSS/Cultuur-Televisie-RSS.htm?output=xml')
+              ,(u'Society'             , u'http://www.elsevier.nl/web/RSS/Society-RSS.htm?output=xml'          )
+              ,(u'Internet & Gadgets'  , u'http://www.elsevier.nl/web/RSS/Internet-Gadgets-RSS.htm?output=xml' )
+              ,(u'Commentaren'         , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml'      )
+            ]
+
+    def print_version(self, url):
+        '''Return ``url`` with '?print=true' appended.'''
+        return url + '?print=true'
+
+    def get_article_url(self, article):
+        '''
+        Return the article URL taken from the feed entry's 'guid'.
+
+        Everything from the last '?' onwards is dropped. A guid without a
+        '?' is returned unchanged, and a missing or empty guid gives None
+        rather than raising AttributeError.
+        '''
+        guid = article.get('guid', None)
+        if not guid:
+            return None
+        base, sep, _query = guid.rpartition('?')
+        return base if sep else guid
+
+    def preprocess_html(self, soup):
+        '''Delete the inline style attribute from every tag that has one; return soup.'''
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
--- a/resources/recipes/taz.recipe
+++ b/resources/recipes/taz.recipe
@ -0,0 +1,83 @
+#!/usr/bin/env  python
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Lars Jacob jacob.lars at gmail.com'
+__docformat__ = 'restructuredtext de'
+
+'''
+www.taz.de/digiabo
+'''
+import os, urllib2, zipfile, tempfile
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TazDigiabo(BasicNewsRecipe):
+	'''
+	Recipe that downloads the subscriber EPUB from http://www.taz.de/epub/
+	and unpacks it into the recipe's output directory.
+	'''
+
+	title = u'Taz Digiabo'
+	description = u'Das EPUB DigiAbo der Taz'
+	language = 'de'
+	lang = 'de-DE'
+
+	__author__ = 'Lars Jacob'
+	needs_subscription = True
+
+	conversion_options = {
+		'no_default_epub_cover' : True
+	}
+
+	def build_index(self):
+		'''
+		Download the current issue as an EPUB and extract it.
+
+		Returns the path of 'content.opf' inside self.output_dir, or None when
+		no credentials are set or the server answers with an HTTP error.
+		'''
+		if self.username is None or self.password is None:
+			return None
+
+		domain = "http://www.taz.de"
+		url = domain + "/epub/"
+
+		# The download is protected by HTTP basic auth (realm 'TAZ-ABO').
+		auth_handler = urllib2.HTTPBasicAuthHandler()
+		auth_handler.add_password(realm='TAZ-ABO',
+								  uri=url,
+								  user=self.username,
+								  passwd=self.password)
+		opener = urllib2.build_opener(auth_handler)
+		urllib2.install_opener(opener)
+
+		try:
+			f = urllib2.urlopen(url)
+		except urllib2.HTTPError:
+			self.report_progress(0,_('Can\'t login to download issue'))
+			return None
+
+		# try/finally so the response, the temp file and the archive are
+		# released even if the download or the extraction fails part-way.
+		try:
+			tmp = tempfile.TemporaryFile()
+			try:
+				self.report_progress(0,_('downloading epub'))
+				tmp.write(f.read())
+
+				zfile = zipfile.ZipFile(tmp, 'r')
+				try:
+					self.report_progress(0,_('extracting epub'))
+					# NOTE(review): member names are trusted as sent by the server;
+					# extractall() is not given any path filtering here.
+					zfile.extractall(self.output_dir)
+				finally:
+					zfile.close()
+			finally:
+				tmp.close()
+		finally:
+			f.close()
+
+		index = os.path.join(self.output_dir, 'content.opf')
+		self.report_progress(1,_('epub downloaded and extracted'))
+		return index
--- a/src/calibre/devices/hanlin/driver.py
+++ b/src/calibre/devices/hanlin/driver.py
@ -115,7 +115,7 @@ class BOOX(HANLINV3):
    supported_platforms = ['windows', 'osx', 'linux']

    # Ordered list of supported formats
-    FORMATS     = ['epub', 'fb2', 'pdf', 'html', 'txt', 'rtf', 'mobi', 'prc', 'chm']
+    FORMATS     = ['epub', 'fb2', 'djvu', 'pdf', 'html', 'txt', 'rtf', 'mobi', 'prc', 'chm']

    VENDOR_ID   = [0x0525]
    PRODUCT_ID  = [0xa4a5]
--- a/src/calibre/devices/iriver/driver.py
+++ b/src/calibre/devices/iriver/driver.py
@ -32,6 +32,7 @@ class IRIVER_STORY(USBMS):

    MAIN_MEMORY_VOLUME_LABEL  = 'Story Main Memory'
    STORAGE_CARD_VOLUME_LABEL = 'Story Storage Card'
+    EBOOK_DIR_MAIN = 'Book'

    SUPPORTS_SUB_DIRS = True

--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@ -19,6 +19,8 @@ def _clean(s):

 def _detag(tag):
    str = u""
+    if tag is None:
+        return str
    for elem in tag:
        if hasattr(elem, "contents"):
            str += _detag(elem)
@ -34,7 +36,7 @@ def _metadata_from_table(soup, searchfor):
    td = td.parent
    # there appears to be multiple ways of structuring the metadata
    # on the home page. cue some nasty special-case hacks...
-    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
+    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I):
        meta = _detag(td.findNextSibling('td'))
        return re.sub('^:', '', meta).strip()
    else:
@ -46,7 +48,7 @@ def _metadata_from_span(soup, searchfor):
    if span is None:
        return None
    # this metadata might need some cleaning up still :/
-    return _detag(span.renderContents().strip())
+    return _detag(span.renderContents(None).strip())

 def _get_authors(soup):
    aut = (_metadata_from_span(soup, r'author')
--- a/src/calibre/ebooks/metadata/rar.py
+++ b/src/calibre/ebooks/metadata/rar.py
@ -32,7 +32,7 @@ def get_metadata(stream):
        if stream_type:
            stream_type = stream_type[1:]
            if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
-                               'rb', 'imp', 'pdf', 'lrf'):
+                               'rb', 'imp', 'pdf', 'lrf', 'azw'):
                with TemporaryDirectory() as tdir:
                    with CurrentDir(tdir):
                       stream = extract_member(path, match=None, name=f,
--- a/src/calibre/ebooks/metadata/topaz.py
+++ b/src/calibre/ebooks/metadata/topaz.py
@ -272,20 +272,19 @@ class MetadataUpdater(object):
    def generate_metadata_stream(self):
        ms = StringIO.StringIO()
        # Generate the header
-        ms.write(self.encode_vwi(len(self.md_header['tag'])))
+        ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
        ms.write(self.md_header['tag'])
        ms.write(chr(self.md_header['flags']))
        ms.write(chr(len(self.metadata)))

-        # Add the metadata fields
+        # Add the metadata fields.
        for item in self.metadata:
-            ms.write(self.encode_vwi(len(self.metadata[item]['tag'])))
+            ms.write(self.encode_vwi(len(self.metadata[item]['tag'])).encode('iso-8859-1'))
            ms.write(self.metadata[item]['tag'])
-            ms.write(self.encode_vwi(len(self.metadata[item]['metadata'])))
+            ms.write(self.encode_vwi(len(self.metadata[item]['metadata'])).encode('iso-8859-1'))
            ms.write(self.metadata[item]['metadata'])

-        return ms.getvalue().encode('iso-8859-1')
-        #return ms.getvalue().encode('utf-8')
+        return ms.getvalue()

    def get_md_header(self,offset):
        md_header = {}
@ -344,11 +343,11 @@ class MetadataUpdater(object):

        if mi.author_sort and pas:
            authors = mi.author_sort
-            update_metadata('Authors',authors)
+            update_metadata('Authors',authors.encode('utf-8'))
        elif mi.authors:
            authors = '; '.join(mi.authors)
            update_metadata('Authors',authors)
-        update_metadata('Title',mi.title)
+        update_metadata('Title',mi.title.encode('utf-8'))

        updated_metadata = self.generate_metadata_stream()
        head = self.fixup_topaz_headers(len(updated_metadata) - self.orig_md_len)
--- a/src/calibre/ebooks/metadata/zip.py
+++ b/src/calibre/ebooks/metadata/zip.py
@ -23,7 +23,7 @@ def get_metadata(stream):
        if stream_type:
            stream_type = stream_type[1:]
            if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
-                               'rb', 'imp', 'pdf', 'lrf'):
+                               'rb', 'imp', 'pdf', 'lrf', 'azw'):
                with TemporaryDirectory() as tdir:
                    with CurrentDir(tdir):
                        path = zf.extract(f)
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -102,7 +102,7 @@ class CSV_XML(CatalogPlugin):
                    if field == 'formats':
                        fmt_list = []
                        for format in item:
-                            fmt_list.append(format.partition('.')[2])
+                            fmt_list.append(format.rpartition('.')[2].lower())
                        item = ', '.join(fmt_list)
                    elif field in ['authors','tags']:
                        item = ', '.join(item)
@ -3869,9 +3869,7 @@ class EPUB_MOBI(CatalogPlugin):
                elem.extract()

            # Reconstruct comments w/o <div>s
-            comments = soup.renderContents()
-            if not isinstance(comments, unicode):
-                comments = comments.decode('utf-8', 'replace')
+            comments = soup.renderContents(None)

            # Convert \n\n to <p>s
            if re.search('\n\n', comments):
@ -3883,7 +3881,7 @@ class EPUB_MOBI(CatalogPlugin):
                    pTag.insert(0,p)
                    soup.insert(tsc,pTag)
                    tsc += 1
-                comments = soup.renderContents()
+                comments = soup.renderContents(None)

            # Convert solo returns to <br />
            comments = re.sub('[\r\n]','<br />', comments)
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -313,3 +313,9 @@ I want some feature added to |app|. What can I do?
 You have two choices: 
 1. Create a patch by hacking on |app| and send it to me for review and inclusion. See `Development <http://calibre-ebook.com/get-involved>`_. 
 2. `Open a ticket <http://bugs.calibre-ebook.com/newticket>`_ (you have to register and login first) and hopefully I will find the time to implement your feature.
+
+Can I include |app| on a CD to be distributed with my product/magazine?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `here <http://code.google.com/p/calibre-ebook/downloads/list>`_.
+
+
--- a/src/calibre/translations/ru.po
+++ b/src/calibre/translations/ru.po
@ -6492,7 +6492,7 @@ msgstr ""
 "                <ol>\n"
 "                <li>Отключите устройство. Дождитесь завершения  создания "
 "базы данных (т.е. дождитесь ее готовности к использованию). Подключите "
-"устройство. Теперь должно все заработать в приложении %(apps)s. Если этого "
+"устройство. Теперь должно все заработать в приложении %(app)s. Если этого "
 "не произошло, то переходите к следующему пункту.</li>\n"
 "                <li>Закройте %(app)s приложение. Найдите файл media.xml в "
 "основной памяти устройства. Удалите его. Отключите устройство. Подождите "
--- a/src/calibre/utils/smtp.py
+++ b/src/calibre/utils/smtp.py
@ -135,7 +135,7 @@ def option_parser():
                            'to SMTP server.'))
    r=parser.add_option_group('SMTP RELAY',
        'Options to use an SMTP relay server to send mail. '
-        '%prog will try to send the email directly unless --relay is '
+        'calibre will try to send the email directly unless --relay is '
        'specified.').add_option
    r('-r', '--relay', help=('An SMTP relay server to use to send mail.'))
    r('-p', '--port', default=-1,