Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit e0e2a0bf40: Merge from trunk
@@ -15,6 +15,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
+    ignore_duplicate_articles = {'title'}

     preprocess_regexps = [
         (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
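The hunk above keeps a preprocess_regexps entry: each entry pairs a compiled pattern with a substitution callable, applied to the downloaded HTML before parsing. A minimal standalone sketch (not part of this commit) of roughly how such a pair behaves:

    import re

    preprocess_regexps = [
        (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?'
                    r'<!-- End tmpl module_competition_offer-->',
                    re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    html = ('<p>keep</p><!-- Begin tmpl module_competition_offer -->ad'
            '<!-- End tmpl module_competition_offer--><p>keep</p>')
    for pattern, repl in preprocess_regexps:
        html = pattern.sub(repl, html)  # DOTALL lets .*? span newlines
    print(html)  # -> <p>keep</p><p>keep</p>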
@@ -1,11 +1,13 @@
+from calibre import browser
 from calibre.web.feeds.news import BasicNewsRecipe
+import re

 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'Countryfile.com'
     #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
     __author__ = 'Dave Asbury'
     description = 'The official website of Countryfile Magazine'
-    # last updated 9/9//12
+    # last updated 7/10/12
     language = 'en_GB'
     oldest_article = 30
     max_articles_per_feed = 25
@@ -13,12 +15,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     no_stylesheets = True
     auto_cleanup = True
     #articles_are_obfuscated = True
+    ignore_duplicate_articles = {'title'}
+
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.countryfile.com/')
-        cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
+        cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')})
         print '******** ',cov,' ***'
         cov2 = str(cov)
-        cov2=cov2[140:223]
+        cov2=cov2[10:101]
         print '******** ',cov2,' ***'
         #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg'
         # try to get cover - if can't get known cover
@@ -40,3 +44,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         (u'Country News', u'http://www.countryfile.com/rss/news'),
         (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
     ]
+
+
+
@@ -72,7 +72,7 @@ class DerSpiegel(BasicNewsRecipe):
             for article in section.findNextSiblings(['dd','dt']):
                 if article.name == 'dt':
                     break
-                link = article.find('a')
+                link = article.find('a', href=True)
                 title = self.tag_to_string(link).strip()
                 if title in self.empty_articles:
                     continue
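The one-line DerSpiegel change matters because a section can contain anchors without an href (named anchors, script stubs); find('a', href=True) only matches anchors that actually carry the attribute, so the later tag_to_string(link) never sees a useless match. Illustrative sketch, assuming calibre's bundled BeautifulSoup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<dd><a name="top"></a><a href="/artikel">Lesen</a></dd>')
    link = soup.find('a')              # may return the href-less <a name="top">
    link = soup.find('a', href=True)   # only anchors with an href attribute
    print(link['href'])                # -> /artikel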
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+

 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'FHM UK'
     description = 'Good News for Men.'
@@ -7,14 +8,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
     masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
     __author__ = 'Dave Asbury'
-    # last updated 1/7/12
+    # last updated 7/10/12
     language = 'en_GB'
-    oldest_article = 28
-    max_articles_per_feed = 8
+    oldest_article = 31
+    max_articles_per_feed = 15
+    remove_empty_feeds = True
     no_stylesheets = True
     #auto_cleanup = True
     # articles_are_obfuscated = True

     keep_only_tags = [
         dict(name='h1'),
         dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
@@ -28,15 +30,13 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):

     ]
     feeds = [
-        (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'),
-        (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'),
-        (u'Homepage 3',u'http://feed43.com/7667138788771570.xml'),
-        (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'),
-        (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-        (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
-        (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link http://feed43.com/feed.html?name=4574262733341068
-        # repeatable search = </div>{|}<a href="{%}" class="{*}">{%}</a>{|}<p>{*}</p>
+        (u'Homepage',u'http://rss.feedsportal.com/c/375/f/434908/index.rss'),
+        (u'Funny',u'http://rss.feedsportal.com/c/375/f/434910/index.rss'),
+        (u'Girls',u'http://rss.feedsportal.com/c/375/f/434913/index.rss'),
     ]


     extra_css = '''
         h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
         h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
@@ -4,7 +4,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'New Musical Express Magazine'
     description = 'Author D.Asbury. UK Rock & Pop Mag. '
     __author__ = 'Dave Asbury'
-    # last updated 9/6/12
+    # last updated 7/10/12
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
@@ -14,15 +14,13 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     language = 'en_GB'

     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.magazinesdirect.com/categories/mens/tv-and-music/')
-        cov = soup.find(attrs={'title' : 'NME magazine subscriptions'})
-        cov2 = 'http://www.magazinesdirect.com'+cov['src']
-        print '***cov = ',cov2,' ***'
-
-        cover_url = str(cov2)
+        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
+        cov = soup.find(attrs={'id' : 'magazine_cover'})
+        cov2 = str(cov['src'])
+        # print '**** Cov url =*', cover_url,'***'
+        #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***'


         br = browser()
         br.set_handle_redirect(False)
         try:
@@ -31,8 +29,8 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
         except:
             cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
         return cover_url
-    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

+    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

     remove_tags = [
         dict( attrs={'class':'clear_icons'}),
@@ -61,9 +59,15 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):


     feeds = [
-        (u'NME News', u'http://feeds2.feedburner.com/nmecom/rss/newsxml'),
+        (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
         #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
-        (u'Reviews',u'http://feed43.com/4138608576351646.xml'),
+        (u'Reviews',u'http://feed43.com/1817687144061333.xml'),
+        (u'Bloggs',u'http://feed43.com/3326754333186048.xml'),

     ]
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+    '''
recipes/pvp_online.recipe (new file, 18 lines)
@@ -0,0 +1,18 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1344926684(BasicNewsRecipe):
+    title = u'PVP online'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    #auto_cleanup = True
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en'
+    remove_javascript = True
+
+    keep_only_tags = [dict(name='div', attrs={'class':'body'})]
+    remove_tags = [dict(name='div', attrs={'class':'prevBg'}),dict(name='div', attrs={'class':'nextBg'}),dict(name='div', attrs={'class':'postMeta'})]
+
+    feeds = [(u'Comics', u'http://pvponline.com/feed'), ]
+
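The new recipe relies entirely on keep_only_tags/remove_tags, which take the same matchers BeautifulSoup uses. A rough sketch (not calibre's actual cleanup code) of the intended effect on one comic page:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '<div class="postMeta">meta</div><div class="body"><img src="strip.png"/></div>'
    soup = BeautifulSoup(html)
    body = soup.find('div', attrs={'class':'body'})           # keep_only_tags
    for cls in ('prevBg', 'nextBg', 'postMeta'):
        for tag in body.findAll('div', attrs={'class':cls}):  # remove_tags
            tag.extract()
    print(body)  # -> <div class="body"><img src="strip.png" /></div>

The quickest way to check such a recipe is calibre's recipe test mode, e.g. ebook-convert pvp_online.recipe .epub --test.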
@@ -5,13 +5,15 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     title = u'Shortlist'
     description = 'Articles From Shortlist.com'
     # I've set oldest article to 7 days as the website updates weekly
-    oldest_article = 7
-    max_articles_per_feed = 12
+    oldest_article = 8
+    max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
+    ignore_duplicate_articles = {'title'}

     __author__ = 'Dave Asbury'
-    # last updated 19/5/12
+    # last updated 7/10/12
     language = 'en_GB'
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.shortlist.com')
@@ -45,17 +47,16 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     ]

     feeds = [
-        (u'Home carousel',u'http://feed43.com/7106317222455380.xml'),
-        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
-        (u'Cool Stuff',u'http://feed43.com/6253845228768456.xml'),
-        (u'Style',u'http://feed43.com/7217107577215678.xml'),
-        (u'Films',u'http://feed43.com/3101308515277265.xml'),
-        (u'Music',u'http://feed43.com/2416400550560162.xml'),
-        (u'TV',u'http://feed43.com/4781172470717123.xml'),
-        (u'Sport',u'http://feed43.com/5303151885853308.xml'),
-        (u'Gaming',u'http://feed43.com/8883764600355347.xml'),
-        (u'Women',u'http://feed43.com/2648221746514241.xml'),
-        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
+        #edit http://feed43.com/feed.html?name=3156308700147005
+        # repeatable pattern = <h3>{_}<a href="{%}">{%}</a>{*}</h3>
+
+        (u'This Weeks Issue', u'http://feed43.com/5205766657404804.xml'),
+        (u'Home Page',u'http://feed43.com/3156308700147005.xml'),
+        (u'Cool Stuff',u'http://feed43.com/1557051772026706.xml'),
+        (u'Style',u'http://feed43.com/4168836374571502.xml'),
+        (u'Entertainment',u'http://feed43.com/4578504030588024.xml'),
+
+
+        #(u'Articles', u'http://feed43.com/3428534448355545.xml')
     ]

@@ -40,6 +40,7 @@ class ANDROID(USBMS):
                 0xca4 : HTC_BCDS,
                 0xca9 : HTC_BCDS,
                 0xcac : HTC_BCDS,
+                0xcba : HTC_BCDS,
                 0xccf : HTC_BCDS,
                 0xcd6 : HTC_BCDS,
                 0xce5 : HTC_BCDS,
@@ -12,19 +12,17 @@ Originally developed by Timothy Legge <timlegge@gmail.com>.
 Extended to support Touch firmware 2.0.0 and later and newer devices by David Forrester <davidfor@internode.on.net>
 '''

-import os, time, calendar
+import os, time
 from contextlib import closing
 from calibre.devices.usbms.books import BookList
-from calibre.devices.usbms.books import CollectionsBookList
+from calibre.devices.kobo.books import KTCollectionsBookList
 from calibre.devices.kobo.books import Book
 from calibre.devices.kobo.books import ImageWrapper
-from calibre.devices.kobo.bookmark import Bookmark
 from calibre.devices.mime import mime_type_ext
 from calibre.devices.usbms.driver import USBMS, debug_print
 from calibre import prints
 from calibre.ptempfile import PersistentTemporaryFile
-
 from calibre.constants import DEBUG
+from calibre.utils.config import prefs
@@ -994,6 +992,7 @@ class KOBO(USBMS):
         return USBMS.create_annotations_path(self, mdata)

     def get_annotations(self, path_map):
+        from calibre.devices.kobo.bookmark import Bookmark
         EPUB_FORMATS = [u'epub']
         epub_formats = set(EPUB_FORMATS)
@@ -1056,6 +1055,7 @@ class KOBO(USBMS):
         return bookmarked_books

     def generate_annotation_html(self, bookmark):
+        import calendar
         from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
         # Returns <div class="user_annotations"> ... </div>
         #last_read_location = bookmark.last_read_location
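Both Kobo hunks move imports (Bookmark, calendar) from module level into the single method that needs them, so merely loading the driver stays cheap and can avoid import cycles. The pattern in miniature, stdlib only:

    import time

    def annotation_epoch(timetuple):
        import calendar  # deferred: paid only when annotations are generated
        return calendar.timegm(timetuple)

    print(annotation_epoch(time.gmtime(0)))  # -> 0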
@@ -6,15 +6,19 @@ __docformat__ = 'restructuredtext en'
 '''
 Convert an ODT file into a Open Ebook
 '''
-import os
+import os, logging

 from lxml import etree
+from cssutils import CSSParser
+from cssutils.css import CSSRule

 from odf.odf2xhtml import ODF2XHTML
 from odf.opendocument import load as odLoad
 from odf.draw import Frame as odFrame, Image as odImage
 from odf.namespaces import TEXTNS as odTEXTNS

 from calibre import CurrentDir, walk
+from calibre.ebooks.oeb.base import _css_logger

 class Extract(ODF2XHTML):

@@ -29,14 +33,14 @@ class Extract(ODF2XHTML):

     def fix_markup(self, html, log):
         root = etree.fromstring(html)
-        self.epubify_markup(root, log)
         self.filter_css(root, log)
-        self.extract_css(root)
+        self.extract_css(root, log)
+        self.epubify_markup(root, log)
         html = etree.tostring(root, encoding='utf-8',
                 xml_declaration=True)
         return html

-    def extract_css(self, root):
+    def extract_css(self, root, log):
         ans = []
         for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
             ans.append(s.text)
@@ -51,9 +55,21 @@ class Extract(ODF2XHTML):
             etree.SubElement(head, ns+'link', {'type':'text/css',
                 'rel':'stylesheet', 'href':'odfpy.css'})

+        css = u'\n\n'.join(ans)
+        parser = CSSParser(loglevel=logging.WARNING,
+                log=_css_logger)
+        self.css = parser.parseString(css, validate=False)
+
         with open('odfpy.css', 'wb') as f:
-            f.write((u'\n\n'.join(ans)).encode('utf-8'))
+            f.write(css.encode('utf-8'))
+
+    def get_css_for_class(self, cls):
+        if not cls: return None
+        for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+            for sel in rule.selectorList:
+                q = sel.selectorText
+                if q == '.' + cls:
+                    return rule

     def epubify_markup(self, root, log):
         from calibre.ebooks.oeb.base import XPath, XHTML
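get_css_for_class is a linear scan over the parsed sheet for an exact ".classname" selector. A self-contained check of that lookup, using the same cssutils calls as the patch:

    import logging
    from cssutils import CSSParser
    from cssutils.css import CSSRule

    sheet = CSSParser(loglevel=logging.WARNING).parseString(
        '.frame1 { margin-left: auto; margin-right: auto; }', validate=False)

    def css_for_class(cls):
        for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            for sel in rule.selectorList:
                if sel.selectorText == '.' + cls:
                    return rule

    print(css_for_class('frame1').style.getPropertyValue('margin-left'))  # -> auto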
@@ -84,16 +100,54 @@ class Extract(ODF2XHTML):
                     div.attrib['style'] = style
                     img.attrib['style'] = 'max-width: 100%; max-height: 100%'

-        # A div/div/img construct causes text-align:center to not work in ADE
-        # so set the display of the second div to inline. This should have no
-        # effect (apart from minor vspace issues) in a compliant HTML renderer
-        # but it fixes the centering of the image via a text-align:center on
-        # the first div in ADE
+        # Handle anchored images. The default markup + CSS produced by
+        # odf2xhtml works with WebKit but not with ADE. So we convert the
+        # common cases of left/right/center aligned block images to work on
+        # both webkit and ADE. We detect the case of setting the side margins
+        # to auto and map it to an appropriate text-align directive, which
+        # works in both WebKit and ADE.
+        # https://bugs.launchpad.net/bugs/1063207
+        # https://bugs.launchpad.net/calibre/+bug/859343
         imgpath = XPath('descendant::h:div/h:div/h:img')
         for img in imgpath(root):
             div2 = img.getparent()
             div1 = div2.getparent()
-            if len(div1) == len(div2) == 1:
-                style = div2.attrib['style']
-                div2.attrib['style'] = 'display:inline;'+style
+            if (len(div1), len(div2)) != (1, 1): continue
+            cls = div1.get('class', '')
+            first_rules = filter(None, [self.get_css_for_class(x) for x in
+                cls.split()])
+            has_align = False
+            for r in first_rules:
+                if r.style.getProperty(u'text-align') is not None:
+                    has_align = True
+            ml = mr = None
+            if not has_align:
+                aval = None
+                cls = div2.get(u'class', u'')
+                rules = filter(None, [self.get_css_for_class(x) for x in
+                    cls.split()])
+                for r in rules:
+                    ml = r.style.getPropertyCSSValue(u'margin-left') or ml
+                    mr = r.style.getPropertyCSSValue(u'margin-right') or mr
+                ml = getattr(ml, 'value', None)
+                mr = getattr(mr, 'value', None)
+                if ml == mr == u'auto':
+                    aval = u'center'
+                elif ml == u'auto' and mr != u'auto':
+                    aval = 'right'
+                elif ml != u'auto' and mr == u'auto':
+                    aval = 'left'
+                if aval is not None:
+                    style = div1.attrib.get('style', '').strip()
+                    if style and not style.endswith(';'):
+                        style = style + ';'
+                    style += 'text-align:%s'%aval
+                    has_align = True
+                    div1.attrib['style'] = style
+
+            if has_align:
+                # This is needed for ADE, without it the text-align has no
+                # effect
+                style = div2.attrib['style']
+                div2.attrib['style'] = 'display:inline;'+style
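The heart of the new anchored-image handling is the margin-to-alignment mapping described in the comments. Distilled into a plain function (illustrative only):

    def align_from_margins(ml, mr):
        # ml, mr: the resolved margin-left/margin-right values, or None
        if ml == mr == 'auto':
            return 'center'
        if ml == 'auto' and mr != 'auto':
            return 'right'
        if ml != 'auto' and mr == 'auto':
            return 'left'
        return None  # no auto margins: leave the markup alone

    assert align_from_margins('auto', 'auto') == 'center'
    assert align_from_margins('auto', '0pt') == 'right'
    assert align_from_margins('0pt', 'auto') == 'left'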
@@ -211,23 +211,25 @@ def main(args=sys.argv):
         msg = compose_mail(args[1], args[2], args[3], subject=opts.subject,
                 attachment=opts.attachment)
         from_, to = args[1:3]
-        efrom, eto = map(extract_email_address, (from_, to))
-        eto = [eto]
+        eto = [extract_email_address(x.strip()) for x in to.split(',')]
+        efrom = extract_email_address(from_)
     else:
         msg = sys.stdin.read()
-        from email.parser import Parser
+        from email import message_from_string
         from email.utils import getaddresses
-        eml = Parser.parsestr(msg, headersonly=True)
+        eml = message_from_string(msg)
         tos = eml.get_all('to', [])
-        ccs = eml.get_all('cc', [])
-        eto = getaddresses(tos + ccs)
+        ccs = eml.get_all('cc', []) + eml.get_all('bcc', [])
+        all_tos = []
+        for x in tos + ccs:
+            all_tos.extend(y.strip() for y in x.split(','))
+        eto = list(map(extract_email_address, all_tos))
+        if not eto:
+            raise ValueError('Email from STDIN does not specify any recipients')
         efrom = getaddresses(eml.get_all('from', []))
         if not efrom:
             raise ValueError('Email from STDIN does not specify a sender')
-        efrom = efrom[0]
+        efrom = efrom[0][1]

     outbox = None
     if opts.outbox is not None:
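The command-line path now splits comma-separated recipients itself, while the STDIN path reads To/Cc/Bcc headers from the parsed message; getaddresses (already used for the sender) is the stdlib way to take such header lists apart. Illustrative:

    from email import message_from_string
    from email.utils import getaddresses

    raw = 'From: a@x.com\nTo: b@y.com, "C" <c@z.com>\nCc: d@w.com\n\nbody'
    eml = message_from_string(raw)
    recips = eml.get_all('to', []) + eml.get_all('cc', []) + eml.get_all('bcc', [])
    print(getaddresses(recips))
    # -> [('', 'b@y.com'), ('C', 'c@z.com'), ('', 'd@w.com')]
    print(getaddresses(eml.get_all('from', []))[0][1])  # -> a@x.com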
@@ -265,6 +265,12 @@ class Feed(object):
         if i > -1:
             self.articles[i:i+1] = []

+    def remove_article(self, article):
+        try:
+            self.articles.remove(article)
+        except ValueError:
+            pass
+
 class FeedCollection(list):

     def __init__(self, feeds):
@@ -167,9 +167,10 @@ class BasicNewsRecipe(Recipe):
     extra_css = None

     #: If True empty feeds are removed from the output.
-    #: This option has no effect if parse_index is overriden in
+    #: This option has no effect if parse_index is overridden in
     #: the sub class. It is meant only for recipes that return a list
-    #: of feeds using `feeds` or :meth:`get_feeds`.
+    #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use
+    #: the ignore_duplicate_articles option.
     remove_empty_feeds = False

     #: List of regular expressions that determines which links to follow
@@ -321,6 +322,15 @@ class BasicNewsRecipe(Recipe):
     #: The string will be used as the disabled message
     recipe_disabled = None

+    #: Ignore duplicates of articles that are present in more than one section.
+    #: A duplicate article is an article that has the same title and/or URL.
+    #: To ignore articles with the same title, set this to:
+    #: ignore_duplicate_articles = {'title'}
+    #: To use URLs instead, set it to:
+    #: ignore_duplicate_articles = {'url'}
+    #: To match on title or URL, set it to:
+    #: ignore_duplicate_articles = {'title', 'url'}
+    ignore_duplicate_articles = None

     # See the built-in profiles for examples of these settings.
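A hypothetical recipe fragment using the option documented above (names and URLs invented for illustration):

    from calibre.web.feeds.news import BasicNewsRecipe

    class DedupedRecipe(BasicNewsRecipe):
        title = 'Example'
        feeds = [('Feed A', 'http://example.com/a.rss'),
                 ('Feed B', 'http://example.com/b.rss')]
        # Drop an article whose title was already seen in an earlier section;
        # remove_empty_feeds then discards any section this leaves empty.
        ignore_duplicate_articles = {'title'}
        remove_empty_feeds = True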
@@ -1019,6 +1029,28 @@ class BasicNewsRecipe(Recipe):
             url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
             return self._fetch_article(url, dir, f, a, num_of_feeds)

+    def remove_duplicate_articles(self, feeds):
+        seen_keys = defaultdict(set)
+        remove = []
+        for f in feeds:
+            for article in f:
+                for key in self.ignore_duplicate_articles:
+                    val = getattr(article, key)
+                    seen = seen_keys[key]
+                    if val:
+                        if val in seen:
+                            remove.append((f, article))
+                        else:
+                            seen.add(val)
+
+        for feed, article in remove:
+            self.log.debug('Removing duplicate article: %s from section: %s'%(
+                article.title, feed.title))
+            feed.remove_article(article)
+
+        if self.remove_empty_feeds:
+            feeds = [f for f in feeds if len(f) > 0]
+        return feeds
+
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
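The bookkeeping above keeps one "seen" set per configured key, so title and url duplicates are tracked independently and only the first sighting of a value survives. The core pattern, reduced:

    from collections import defaultdict

    seen_keys = defaultdict(set)   # one set per key: 'title', 'url'

    def is_duplicate(key, value):
        seen = seen_keys[key]
        if value in seen:
            return True
        seen.add(value)
        return False

    print(is_duplicate('title', 'Hello'))  # -> False (first sighting)
    print(is_duplicate('title', 'Hello'))  # -> True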
@@ -1033,6 +1065,9 @@ class BasicNewsRecipe(Recipe):
         if not feeds:
             raise ValueError('No articles found, aborting')

+        if self.ignore_duplicate_articles is not None:
+            feeds = self.remove_duplicate_articles(feeds)
+
         #feeds = FeedCollection(feeds)

         self.report_progress(0, _('Trying to download cover...'))
@@ -68,7 +68,12 @@ def serialize_collection(mapping_of_recipe_classes):
             key=lambda key: force_unicode(
                 getattr(mapping_of_recipe_classes[key], 'title', 'zzz'),
                 'utf-8')):
-        recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        try:
+            recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        except:
+            import traceback
+            traceback.print_exc()
+            continue
         collection.append(recipe)
     collection.set('count', str(len(collection)))
     return etree.tostring(collection, encoding='utf-8', xml_declaration=True,