Added recipes for the Japan Times and The San Francisco Chronicle (thanks to Darko Miletic) as well as various minor bug fixes

2025-11-13 10:06:59 -05:00 · 2008-11-15 10:46:04 -08:00 · 2008-11-15 10:46:04 -08:00 · 0433c934a6
commit 0433c934a6
parent 2938b2802e
14 changed files with 109 additions and 15 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -304,7 +304,10 @@ def convert(htmlfile, opts, notification=None):
        
        opf.add_path_to_manifest(os.path.join(tdir, 'content', 'resources', '_cover_.jpg'), 'image/jpeg')    
        with open(opf_path, 'wb') as f:
-            f.write(opf.render())
+            raw = opf.render()
+            if not raw.startswith('<?xml '):
+                raw = '<?xml version="1.0"  encoding="UTF-8"?>\n'+raw
+            f.write(raw)
        epub = initialize_container(opts.output)
        epub.add_dir(tdir)
        if opts.show_opf:
--- a/src/calibre/ebooks/metadata/epub.py
+++ b/src/calibre/ebooks/metadata/epub.py
@@ -235,7 +235,7 @@ def main(args=sys.argv):
        
        if changed:
            set_metadata(stream, mi)
-        print unicode(get_metadata(stream, extract_cover=False))
+        print unicode(get_metadata(stream, extract_cover=False)).encode('utf-8')
        
    if mi.cover_data[1] is not None:
        cpath = os.path.splitext(os.path.basename(args[1]))[0] + '_cover.jpg'
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@@ -20,6 +20,7 @@ from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.metadata.rtf  import set_metadata as set_rtf_metadata
 from calibre.ebooks.lrf.meta      import set_metadata as set_lrf_metadata
 from calibre.ebooks.metadata.epub import set_metadata as set_epub_metadata
+from calibre.ebooks.metadata.pdf  import set_metadata as set_pdf_metadata
 try:
    from calibre.libunrar import extract_member as rar_extract_first
 except OSError:
@@ -122,8 +123,6 @@ def get_comic_cover(stream, type):
        ext = os.path.splitext(path)[1][1:]
        return (ext.lower(), data)
        
-        
-
 def set_metadata(stream, mi, stream_type='lrf'):
    if stream_type: stream_type = stream_type.lower()
    if stream_type == 'lrf':
@@ -132,6 +131,8 @@ def set_metadata(stream, mi, stream_type='lrf'):
        set_epub_metadata(stream, mi)
    elif stream_type == 'rtf':
        set_rtf_metadata(stream, mi)
+    #elif stream_type == 'pdf':
+    #    set_pdf_metadata(stream, mi)

 def metadata_from_filename(name, pat=None):
    name = os.path.splitext(name)[0]
--- a/src/calibre/ebooks/metadata/opf.py
+++ b/src/calibre/ebooks/metadata/opf.py
@@ -519,6 +519,8 @@ class OPFCreator(MetaInformation):
        self.guide.set_basedir(self.base_path)
        
        opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
+        if not opf.startswith('<?xml '):
+            opf = '<?xml version="1.0"  encoding="UTF-8"?>\n'+opf
        opf_stream.write(opf)
        opf_stream.flush()
        toc = getattr(self, 'toc', None)
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -2,9 +2,9 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''

-import sys, os
+import sys, os, re

-from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
 from pyPdf import PdfFileReader

 def get_metadata(stream):
@@ -28,16 +28,43 @@ def get_metadata(stream):
        msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
        print >>sys.stderr, msg.encode('utf8')
    return mi
-        
+
+def set_metadata(stream, mi):
+    stream.seek(0)
+    raw = stream.read()
+    if mi.title:
+        tit = mi.title.encode('utf-8') if isinstance(mi.title, unicode) else mi.title
+        raw = re.compile(r'<<.*?/Title\((.+?)\)', re.DOTALL).sub(lambda m: m.group().replace(m.group(1), tit), raw)
+    if mi.authors:
+        au = authors_to_string(mi.authors)
+        if isinstance(au, unicode):
+            au = au.encode('utf-8')
+        raw = re.compile(r'<<.*?/Author\((.+?)\)', re.DOTALL).sub(lambda m: m.group().replace(m.group(1), au), raw)
+    stream.seek(0)
+    stream.truncate()
+    stream.write(raw)
+    stream.seek(0)
+
+def option_parser():
+    p = get_parser('pdf')
+    p.remove_option('--category')
+    p.remove_option('--comment')
+    return p
            
 def main(args=sys.argv):
+    #p = option_parser()
+    #opts, args = p.parse_args(args)
    if len(args) != 2:
        print >>sys.stderr, _('Usage: pdf-meta file.pdf')
        print >>sys.stderr, _('No filename specified.')
        return 1
    
-    path = os.path.abspath(os.path.expanduser(args[1]))
-    print get_metadata(open(path, 'rb'))
+    stream = open(os.path.abspath(os.path.expanduser(args[1])), 'r+b')
+    #mi = MetaInformation(opts.title, opts.authors)
+    #if mi.title or mi.authors:
+    #    set_metadata(stream, mi)
+    print unicode(get_metadata(stream)).encode('utf-8')
+    
    return 0

 if __name__ == '__main__':
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -414,7 +414,8 @@ except:
 class Application(QApplication):
    
    def __init__(self, args):
-        QApplication.__init__(self, args)
+        qargs = [i.encode('utf-8') if isinstance(i, unicode) else i for i in args]
+        QApplication.__init__(self, qargs)
        self.translator = QTranslator(self)
        lang = get_lang()
        if lang:
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -667,9 +667,10 @@ class Main(MainWindow, Ui_MainWindow):
            self.library_view.model().refresh_ids([id])

        for row in rows:
-            MetadataSingleDialog(self, row.row(),
+            d = MetadataSingleDialog(self, row.row(),
                                    self.library_view.model().db,
                                    accepted_callback=accepted)
+            d.exec_()

    def edit_bulk_metadata(self, checked):
        '''
--- a/src/calibre/library/server.py
+++ b/src/calibre/library/server.py
@@ -252,7 +252,7 @@ class LibraryServer(object):
                    extra.append('RATING: %s<br />'%rating)
                tags = record[FIELD_MAP['tags']]
                if tags:
-                    extra.append('TAGS: %s<br />'%', '.join(tags))
+                    extra.append('TAGS: %s<br />'%', '.join(tags.split(',')))
                series = record[FIELD_MAP['series']]
                if series:
                    extra.append('SERIES: %s [%d]<br />'%(series, record[FIELD_MAP['series_index']]))
--- a/src/calibre/library/static/index.html
+++ b/src/calibre/library/static/index.html
@@ -11,7 +11,7 @@
 	</head>
 	<body>
 	    <div id="banner">
-	        <img src="/static/calibre.png" alt="calibre" />
+	        <a style="border: 0pt" href="http://calibre.kovidgoyal.net" alt="calibre" title="calibre"><img style="border:0pt" src="/static/calibre.png" alt="calibre" /></a>
        </div>
        
 	    <div id="search_box">
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -500,7 +500,6 @@ class BasicNewsRecipe(object, LoggingInterface):
        if self.no_stylesheets:
            for link in list(soup.findAll('link', type=re.compile('css')))+list(soup.findAll('style')):
                link.extract()
-        
        head = soup.find('head')
        if not head:
            head = soup.find('body')
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -11,7 +11,7 @@ recipes = [
           'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht', 'lanacion',
           'discover_magazine', 'scientific_american', 'new_york_review_of_books',
           'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92', 
-           'politika', 'moscow_times', 'latimes'
+           'politika', 'moscow_times', 'latimes', 'japan_times', 'san_fran_chronicle',
          ]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/japan_times.py
+++ b/src/calibre/web/feeds/recipes/japan_times.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+'''
+japantimes.co.jp
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JapanTimes(BasicNewsRecipe):
+    title                 = u'The Japan Times'
+    __author__            = 'Darko Miletic'
+    description           = 'News from Japan'    
+    oldest_article        = 7
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+
+    keep_only_tags    = [ dict(name='div', attrs={'id':'searchresult'}) ]
+    remove_tags_after = [ dict(name='div', attrs={'id':'mainbody'    }) ]
+    remove_tags       = [
+                           dict(name='div'  , attrs={'id':'ads' })
+                          ,dict(name='table', attrs={'width':470})
+                        ]
+
+
+    feeds          = [
+                        (u'The Japan Times', u'http://feedproxy.google.com/japantimes')
+                     ]
--- a/src/calibre/web/feeds/recipes/san_fran_chronicle.py
+++ b/src/calibre/web/feeds/recipes/san_fran_chronicle.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+'''
+sfgate.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class SanFranciscoChronicle(BasicNewsRecipe):
+    title                 = u'San Francisco Chronicle'
+    __author__            = u'Darko Miletic'
+    description           = u'San Francisco news'    
+    oldest_article        = 7
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+
+    keep_only_tags    = [dict(name='td' , attrs={'class':'column1 w627'})]
+    remove_tags_after =  dict(name='div', attrs={'id':'articlecontent' })
+    remove_tags = [
+                     dict(name='div', attrs={'class':'tools tools_top'})
+                    ,dict(name='div', attrs={'id':'articlebox'        })
+                  ]
+
+    feeds          = [  
+                         (u'Top News Stories', u'http://www.sfgate.com/rss/feeds/news.xml')
+                     ]
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -122,6 +122,7 @@ class RecursiveFetcher(object, LoggingInterface):
    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
+        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
         
        if self.keep_only_tags: