Merge from trunk

2026-06-07 14:35:27 -04:00 · 2010-06-24 07:41:55 +01:00
parent 55b9a96fd8 195a3a9cd1
commit bb6cacd205
12 changed files with 207 additions and 38 deletions
@@ -1,6 +1,6 @@

 __license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 lrb.co.uk
 '''
@@ -8,32 +8,38 @@ lrb.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe

 class LondonReviewOfBooks(BasicNewsRecipe):
-    title                 = u'London Review of Books'
-    __author__            = u'Darko Miletic'
-    description           = u'Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'
-    category              = 'news, literature, England'
-    publisher             = 'London Review of Books'
-    oldest_article        = 7
+    title                 = 'London Review of Books (free)'
+    __author__            = 'Darko Miletic'
+    description           = 'Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'
+    category              = 'news, literature, UK'
+    publisher             = 'LRB ltd.'
+    oldest_article        = 15
    max_articles_per_feed = 100
    language              = 'en_GB'
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
+    publication_type      = 'magazine'
+    masthead_url          = 'http://www.lrb.co.uk/assets/images/lrb_logo_big.gif'
+    extra_css             = ' body{font-family: Georgia,Palatino,"Palatino Linotype",serif} '

-    conversion_options = {  
+    conversion_options = {
                             'comments'  : description
                            ,'tags'      : category
                            ,'language'  : language
                            ,'publisher' : publisher
                         }
-    
-    keep_only_tags = [dict(name='div' , attrs={'id'   :'main'})]
-    remove_tags = [
-                    dict(name='div' , attrs={'class':['pagetools','issue-nav-controls','nocss']})
-                   ,dict(name='div' , attrs={'id'   :['mainmenu','precontent','otherarticles']     })
-                   ,dict(name='span', attrs={'class':['inlineright','article-icons']})
-                   ,dict(name='ul'  , attrs={'class':'article-controls'})
-                   ,dict(name='p'   , attrs={'class':'meta-info'       })
-                  ]
+
+    keep_only_tags = [dict(attrs={'class':['article-body indent','letters','article-list']})]
+    remove_attributes = ['width','height']

    feeds = [(u'London Review of Books', u'http://www.lrb.co.uk/lrbrss.xml')]
+
+    def get_cover_url(self):
+        '''
+        Return the URL of the current issue's cover image, or None.
+
+        The site's front page is fetched and searched for a
+        ``<p class="cover">`` element; the ``src`` of the image inside its
+        link is appended to the site root.
+        '''
+        soup = self.index_to_soup('http://www.lrb.co.uk/')
+        cover_item = soup.find('p',attrs={'class':'cover'})
+        if not cover_item:
+            return None
+        # Check every hop: a cover paragraph without a link or an image
+        # would otherwise raise instead of simply yielding no cover.
+        link = cover_item.a
+        image = link.img if link is not None else None
+        if image is None or not image.has_key('src'):
+            return None
+        return 'http://www.lrb.co.uk' + image['src']
+
@@ -0,0 +1,75 @@
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+lrb.co.uk
+'''
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LondonReviewOfBooksPayed(BasicNewsRecipe):
+    '''
+    Recipe for the subscription edition of the London Review of Books.
+
+    Logs in when both a username and a password are set, then builds a
+    single feed from the article list of the issue that the cover on the
+    site's front page links to.
+    '''
+    title                 = 'London Review of Books'
+    __author__            = 'Darko Miletic'
+    description           = 'Subscription content. Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'
+    category              = 'news, literature, UK'
+    publisher             = 'LRB Ltd.'
+    max_articles_per_feed = 100
+    language              = 'en_GB'
+    no_stylesheets        = True
+    delay                 = 1
+    use_embedded_content  = False
+    encoding              = 'utf-8'
+    INDEX                 = 'http://www.lrb.co.uk'
+    LOGIN                 = INDEX + '/login'
+    masthead_url          = INDEX + '/assets/images/lrb_logo_big.gif'
+    needs_subscription    = True
+    publication_type      = 'magazine'
+    extra_css             = ' body{font-family: Georgia,Palatino,"Palatino Linotype",serif} '
+
+
+    def get_browser(self):
+        '''
+        Return a browser, logged in to the site when credentials are set.
+        '''
+        # NOTE(review): called on the class without an instance; this
+        # presumably relies on get_browser being a classmethod -- confirm.
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open(self.LOGIN)
+            # The login form is selected by position (nr=1), not by name
+            br.select_form(nr=1)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
+    def parse_index(self):
+        '''
+        Build the article list for the current issue.
+
+        Returns a one-element list ``[(feed_title, articles)]``. Each
+        article is a dict with ``title``, ``date``, ``url`` and
+        ``description`` keys. The feed title is the issue page's <title>,
+        or the recipe title when the front page has no cover block.
+        ``articles`` is empty when the cover block or the issue's article
+        list cannot be found.
+        '''
+        articles = []
+        lrbtitle = self.title
+        soup = self.index_to_soup(self.INDEX)
+        cover_item = soup.find('p',attrs={'class':'cover'})
+        if not cover_item:
+            return [(lrbtitle, articles)]
+        # The cover block supplies both the cover image and the link to
+        # the page listing the issue's articles.
+        self.cover_url = self.INDEX + cover_item.a.img['src']
+        soup2 = self.index_to_soup(self.INDEX + cover_item.a['href'])
+        lrbtitle = soup2.head.title.string
+        sitem = soup2.find(attrs={'class':'article-list'})
+        if sitem is None:
+            # No article list on the issue page: return an empty feed
+            # rather than failing with AttributeError on None.findAll.
+            return [(lrbtitle, articles)]
+        # One timestamp for the whole issue
+        date = strftime(self.timefmt)
+        for item in sitem.findAll('a',attrs={'class':'title'}):
+            if item.has_key('href'):
+                articles.append({
+                                  'title'      :self.tag_to_string(item)
+                                 ,'date'       :date
+                                 ,'url'        :self.INDEX + item['href']
+                                 ,'description':u''
+                                })
+        return [(lrbtitle, articles)]
+
+    conversion_options = {
+                             'comments'  : description
+                            ,'tags'      : category
+                            ,'language'  : language
+                            ,'publisher' : publisher
+                         }
+
+    # Keep only the article body / letters containers of each page
+    keep_only_tags = [dict(name='div' , attrs={'class':['article-body indent','letters']})]
+    remove_attributes = ['width','height']
@@ -172,5 +172,10 @@ class CollectionsBookList(BookList):
        For each book in the booklist for the card oncard, remove it from all
        its current collections, then add it to the collections specified in
        device_collections.
+
+        oncard is None for the main memory, carda for card A, cardb for card B,
+        etc.
+
+        booklist is the object created by the :meth:`books` call above.
        '''
        pass
@@ -107,9 +107,21 @@ class CSSPreProcessor(object):

    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')

-    def __call__(self, data):
+    def __call__(self, data, add_namespace=False):
+        '''
+        Strip ``@page`` rules from the CSS text `data`.
+
+        When `add_namespace` is true the XHTML ``@namespace`` declaration
+        is also inserted, after any leading ``@import``/``@charset`` lines
+        since a namespace rule has to follow those in a stylesheet.
+
+        Returns the processed CSS text.
+        '''
+        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.PAGE_PAT.sub('', data)
-        return data
+        if not add_namespace:
+            return data
+        namespace = XHTML_CSS_NAMESPACE.strip()
+        ans, namespaced = [], False
+        for line in data.splitlines():
+            ll = line.lstrip()
+            # Blank lines are passed over too, so that a blank line between
+            # @charset and @import does not push the namespace declaration
+            # in front of the imports.
+            if not (namespaced or not ll or
+                        ll.startswith(('@import', '@charset'))):
+                ans.append(namespace)
+                namespaced = True
+            ans.append(line)
+        if not namespaced:
+            # Nothing but imports/charset/blank lines (or no text at all):
+            # the declaration still has to be present, so put it last.
+            ans.append(namespace)
+
+        return u'\n'.join(ans)

 class HTMLPreProcessor(object):

@@ -20,7 +20,7 @@ from itertools import izip
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
-from calibre.constants import islinux, isfreebsd
+from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
@@ -32,9 +32,14 @@ class Link(object):

    @classmethod
    def url_to_local_path(cls, url, base):
+        # Turn the parsed `url` into a filesystem path. A path recognised as
+        # absolute is returned as is; anything else is joined onto `base`
+        # and made absolute.
-        path = urlunparse(('', '', url.path, url.params, url.query, ''))
+        path = url.path
+        isabs = False
+        if iswindows and path.startswith('/'):
+            # Drop the leading slash and remember that the path was absolute.
+            # NOTE(review): presumably meant for URL paths like /C:/dir, which
+            # should become C:/dir -- confirm how drive-less paths such as
+            # /dir should be treated here.
+            path = path[1:]
+            isabs = True
+        path = urlunparse(('', '', path, url.params, url.query, ''))
        path = unquote(path)
-        if os.path.isabs(path):
+        if isabs or os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

@@ -307,6 +312,7 @@ class HTMLInput(InputFormatPlugin):
            xpath
        from calibre import guess_type
        import cssutils
+        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb
@@ -371,7 +377,7 @@ class HTMLInput(InputFormatPlugin):
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
-            if item.media_type in OEB_STYLES:
+            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
@@ -409,12 +415,30 @@ class HTMLInput(InputFormatPlugin):
        oeb.container = DirContainer(os.getcwdu(), oeb.log)
        return oeb

+    def link_to_local_path(self, link_, base=None):
+        '''
+        Resolve `link_` to a local filesystem path.
+
+        `link_` may be unicode or a byte string (decoded as UTF-8). `base`
+        is passed to :class:`Link`; the current working directory is used
+        when it is not given.
+
+        Returns ``(path, fragment)``, or ``(None, None)`` when the link
+        cannot be decoded or parsed, is not a local resource, or is empty.
+        '''
+        if not isinstance(link_, unicode):
+            try:
+                # 'strict' is the standard codec error handler: links that
+                # are not valid UTF-8 are rejected rather than mangled.
+                link_ = link_.decode('utf-8', 'strict')
+            except Exception:
+                self.log.warn('Failed to decode link %r. Ignoring'%link_)
+                return None, None
+        try:
+            l = Link(link_, base if base else os.getcwdu())
+        except Exception:
+            self.log.exception('Failed to process link: %r'%link_)
+            return None, None
+        if l.path is None:
+            # Not a local resource
+            return None, None
+        link = l.path.replace('/', os.sep).strip()
+        frag = l.fragment
+        if not link:
+            return None, None
+        return link, frag

    def resource_adder(self, link_, base=None):
-        link = self.urlnormalize(link_)
-        link, frag = self.urldefrag(link)
-        link = unquote(link).replace('/', os.sep)
-        if not link.strip():
+        link, frag = self.link_to_local_path(link_, base=base)
+        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
@@ -442,6 +466,9 @@ class HTMLInput(InputFormatPlugin):

            item = self.oeb.manifest.add(id, href, media_type)
            item.html_input_href = bhref
+            if guessed in self.OEB_STYLES:
+                item.override_css_fetch = partial(
+                        self.css_import_handler, os.path.dirname(link))
            item.data
            self.added_resources[link] = href

@@ -450,7 +477,17 @@ class HTMLInput(InputFormatPlugin):
            nlink = '#'.join((nlink, frag))
        return nlink

-
+    def css_import_handler(self, base, href):
+        '''
+        Load the CSS file that `href` points to, resolved against `base`.
+
+        Returns ``(None, css_text)`` on success, with the text decoded as
+        UTF-8 and run through the CSS preprocessor. Returns
+        ``(None, None)`` when the target is not a readable local file or
+        when reading or preprocessing it fails.
+        '''
+        link, frag = self.link_to_local_path(href, base=base)
+        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
+            return (None, None)
+        try:
+            # The with block closes the file even when reading fails
+            with open(link, 'rb') as f:
+                raw = f.read().decode('utf-8', 'replace')
+            raw = self.oeb.css_preprocessor(raw, add_namespace=True)
+        except Exception:
+            self.log.exception('Failed to read CSS file: %r'%link)
+            return (None, None)
+        return (None, raw)



@@ -17,6 +17,7 @@ from urlparse import urljoin

 from lxml import etree, html
 from cssutils import CSSParser
+from cssutils.css import CSSRule

 import calibre
 from calibre.constants import filesystem_encoding
@@ -762,6 +763,7 @@ class Manifest(object):
            self.href = self.path = urlnormalize(href)
            self.media_type = media_type
            self.fallback = fallback
+            self.override_css_fetch = None
            self.spine_position = None
            self.linear = True
            if loader is None and data is None:
@@ -982,15 +984,40 @@ class Manifest(object):


        def _parse_css(self, data):
+            # Parse CSS text into a stylesheet object. Style and font-face
+            # rules reachable through @import rules are copied into this
+            # sheet and the @import rules themselves are then removed.
+
+            def get_style_rules_from_import(import_rule):
+                # Collect, recursively, the style and font-face rules of the
+                # sheet an @import rule refers to (nothing if it has none).
+                ans = []
+                if not import_rule.styleSheet:
+                    return ans
+                rules = import_rule.styleSheet.cssRules
+                for rule in rules:
+                    if rule.type == CSSRule.IMPORT_RULE:
+                        ans.extend(get_style_rules_from_import(rule))
+                    elif rule.type in (CSSRule.FONT_FACE_RULE,
+                            CSSRule.STYLE_RULE):
+                        ans.append(rule)
+                return ans
+
            self.oeb.log.debug('Parsing', self.href, '...')
            data = self.oeb.decode(data)
-            data = self.oeb.css_preprocessor(data)
-            data = XHTML_CSS_NAMESPACE + data
+            data = self.oeb.css_preprocessor(data, add_namespace=True)
            parser = CSSParser(loglevel=logging.WARNING,
-                               fetcher=self._fetch_css,
+                               fetcher=self.override_css_fetch or self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href)
            data.namespaces['h'] = XHTML_NS
+            import_rules = list(data.cssRules.rulesOfType(CSSRule.IMPORT_RULE))
+            rules_to_append = []
+            # Imported rules go in front of the sheet's first own style rule
+            insert_index = None
+            for r in data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+                insert_index = data.cssRules.index(r)
+                break
+            if insert_index is None:
+                # No style rule in this sheet: insert at the current end.
+                # A fixed index is needed because insertRule() appends when
+                # index is None, which combined with the reversed loop
+                # below would leave the imported rules in reverse order.
+                insert_index = len(data.cssRules)
+            for rule in import_rules:
+                rules_to_append.extend(get_style_rules_from_import(rule))
+            # Inserting in reverse at one fixed index keeps the source order
+            for r in reversed(rules_to_append):
+                data.insertRule(r, index=insert_index)
+            for rule in import_rules:
+                data.deleteRule(rule)
            return data

        def _fetch_css(self, path):
@@ -139,11 +139,18 @@ class EbookIterator(object):
                    if id != -1:
                        families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
                        if family:
-                            family = family.group(1).strip().replace('"', '')
-                            bad_map[family] = families[0]
-                            if family not in families:
+                            family = family.group(1)
+                            specified_families = [x.strip().replace('"',
+                                '').replace("'", '') for x in family.split(',')]
+                            aliasing_ok = False
+                            for f in specified_families:
+                                bad_map[f] = families[0]
+                                if not aliasing_ok and f in families:
+                                    aliasing_ok = True
+
+                            if not aliasing_ok:
                                prints('WARNING: Family aliasing not fully supported.')
-                                prints('\tDeclared family: %s not in actual families: %s'
+                                prints('\tDeclared family: %r not in actual families: %r'
                                        % (family, families))
                            else:
                                prints('Loaded embedded font:', repr(family))
@@ -1240,6 +1240,8 @@ class DeviceMixin(object): # {{{
        self.card_b_view.reset()

    def _upload_collections(self, job, view):
+        # Hand a failed job to device_job_exception; the view is reset
+        # whether or not the job failed.
+        if job.failed:
+            self.device_job_exception(job)
        view.reset()

    def upload_collections(self, booklist, view):
@@ -74,5 +74,3 @@ class TagListEditor(QDialog, Ui_TagListEditor):
            self.to_delete.append(id)
            self.available_tags.takeItem(self.available_tags.row(item))

-    def accept(self):
-        QDialog.accept(self)
@@ -302,7 +302,7 @@ Take your pick:

 Why does |app| show only some of my fonts on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| embeds fonts in ebook files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts founf on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory.
+|app| embeds fonts in ebook files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts found on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory.

 |app| is not starting on Windows?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~