Misc. fixes

Kovid Goyal 2009-07-15 12:36:50 -06:00
parent eb625d37c3
commit 656c55debf
5 changed files with 84 additions and 137 deletions

View File

@@ -24,7 +24,7 @@ class ANDROID(USBMS):
]
PRODUCT_ID = [0x0c02]
BCD = [0x100]
- EBOOK_DIR_MAIN = 'wordplayer/calibre'
+ EBOOK_DIR_MAIN = 'wordplayer/calibretransfer'
VENDOR_NAME = 'HTC'
WINDOWS_MAIN_MEM = 'ANDROID_PHONE'

View File

@@ -3,12 +3,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@@ -30,9 +30,9 @@ def detect(aBuf):
# Added by Kovid
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>',
re.IGNORECASE)
]
ENTITY_PATTERN = re.compile(r'&(\S+?);')
@@ -51,7 +51,7 @@ def substitute_entites(raw):
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def force_encoding(raw, verbose):
from calibre.constants import preferred_encoding
@@ -70,19 +70,19 @@ def force_encoding(raw, verbose):
if encoding == 'ascii':
encoding = 'utf-8'
return encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
encoding = None
if not raw:
return u'', encoding
if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
@@ -103,10 +103,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
except LookupError:
encoding = 'utf-8'
raw = raw.decode(encoding, 'replace')
if strip_encoding_pats:
raw = strip_encoding_declarations(raw)
if resolve_entities:
raw = substitute_entites(raw)
return raw, encoding
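
For orientation, the docstring above is the whole contract of xml_to_unicode; a minimal usage sketch (the import path and the sample file name are assumptions for illustration, not part of this commit):

    from calibre.ebooks.chardet import xml_to_unicode

    raw = open('page.html', 'rb').read()        # byte string, encoding unknown
    text, used_encoding = xml_to_unicode(raw,
            verbose=True,                       # warn when chardet confidence is below 100%
            strip_encoding_pats=True,           # drop XML/HTML encoding declarations from the result
            resolve_entities=True)              # substitute &entity; references
    print used_encoding                         # e.g. 'utf-8'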

View File

@@ -163,7 +163,7 @@ def fetch_scheduled_recipe(recipe, script):
OptionRecommendation.HIGH))
lf = load_defaults('look_and_feel')
if lf.get('base_font_size', 0.0) != 0.0:
- recs.append(('base_font_size', ps['base_font_size'],
+ recs.append(('base_font_size', lf['base_font_size'],
OptionRecommendation.HIGH))
args = [script, pt.name, recs]

View File

@@ -1015,7 +1015,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
def books_in_series_of(self, index, index_is_id=False):
'''
- Return an ordered list of all books in the series that the book indetified by index belongs to.
+ Return an ordered list of all books in the series that the book identified by index belongs to.
If the book does not belong to a series return an empty list. The list contains book ids.
'''
series_id = self.series_id(index, index_is_id=index_is_id)
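
The docstring above fully specifies the return value; a short caller sketch (the import path and library location are assumptions for illustration):

    from calibre.library.database2 import LibraryDatabase2   # assumed import path
    db = LibraryDatabase2('/path/to/library')                 # hypothetical library folder

    # Ordered ids of all books in the same series as the book with database id 42;
    # an empty list if that book does not belong to a series.
    for book_id in db.books_in_series_of(42, index_is_id=True):
        print book_id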

View File

@@ -16,32 +16,62 @@ class NYTimes(BasicNewsRecipe):
__author__ = 'GRiker'
language = _('English')
description = 'Top Stories from the New York Times'
#max_articles_per_feed = 3
# List of sections typically included in Top Stories. Use a keyword from the
# right column in the excludeSectionKeywords[] list to skip downloading that section
sections = {
'arts' : 'Arts',
'business' : 'Business',
'diningwine' : 'Dining & Wine',
'editorials' : 'Editorials',
'health' : 'Health',
'magazine' : 'Magazine',
'mediaadvertising' : 'Media & Advertising',
'newyorkregion' : 'New York/Region',
'oped' : 'Op-Ed',
'politics' : 'Politics',
'science' : 'Science',
'sports' : 'Sports',
'technology' : 'Technology',
'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
}
# By default, no sections are skipped.
excludeSectionKeywords = []
# Add section keywords from the right column above to skip that section
# For example, to skip sections containing the word 'Sports' or 'Dining', use:
# excludeSectionKeywords = ['Sports', 'Dining']
# Fetch only Business and Technology
#excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# Fetch only Top Stories
#excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# The maximum number of articles that will be downloaded
max_articles_per_feed = 50
timefmt = ''
needs_subscription = True
remove_tags_after = dict(attrs={'id':['comments']})
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
'columnGroup','entry-meta','entry-response module','jumpLink','nav',
'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
'adxLeaderboard']),
dict(name=['script', 'noscript', 'style','hr'])]
encoding = 'cp1252'
no_stylesheets = True
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '.headline {text-align:left;}\n\
.byline {font:monospace; margin-bottom:0px;}\n\
.source {align:left;}\n\
.credit {align:right;}\n'
flatPeriodical = True
feed = None
ans = []
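
The excludeSectionKeywords comments above describe keyword-based section filtering; a self-contained sketch of the matching that parse_index performs further down (all names here are local to the example):

    import re

    sections = {'sports': 'Sports', 'diningwine': 'Dining & Wine', 'world': 'World'}
    excludeSectionKeywords = ['Sports', 'Dining']

    if len(excludeSectionKeywords):                    # guard: '|'.join([]) would match every title
        excluded = re.compile('|'.join(excludeSectionKeywords))
        for key in sections.values():
            if excluded.search(key):
                print "skipping section %s" % key      # 'Sports' and 'Dining & Wine'
            else:
                print "keeping section %s" % key       # 'World'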
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
@@ -54,14 +84,8 @@ class NYTimes(BasicNewsRecipe):
def index_to_soup(self, url_or_raw, raw=False):
'''
- Convenience method that takes an URL to the index page and returns
- a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
- of it.
- This is an OVERRIDE of the method provided in news.py to solve an encoding problem
- with NYTimes index pages which seem to be encoded in a wonderful blend
- `url_or_raw`: Either a URL or the downloaded index page as a string
+ OVERRIDE of class method
+ deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
@@ -88,58 +112,18 @@ class NYTimes(BasicNewsRecipe):
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose :
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
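
The override's docstring is terse; the logic visible above is a two-pass parse: read the page with the recipe's default cp1252 encoding, look at the encoding the document itself declares, and re-parse only if they differ. A standalone sketch of that idea (detect_declared_encoding is a hypothetical helper, not part of the recipe):

    import re

    def detect_declared_encoding(raw, default='cp1252'):
        # look for a charset declaration in the raw, undecoded page
        m = re.search(r'charset=([^\s\'";>]+)', raw, re.IGNORECASE)
        return m.group(1) if m else default

    raw = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    declared = detect_declared_encoding(raw)
    if declared != 'cp1252':
        print "re-parse with the declared encoding: %s" % declared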
def parse_index(self):
articles = {}
ans = []
if self.flatPeriodical :
self.feed = key = 'All Top Stories'
articles[key] = []
self.ans.append(key)
else :
key = None
'''
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
'''
sections = {
'arts' : 'Arts',
'business' : 'Business',
'editorials' : 'Editorials',
'health' : 'Health',
'magazine' : 'Magazine',
'mediaadvertising' : 'Media & Advertising',
'newyorkregion' : 'New York/Region',
'oped' : 'Op-Ed',
'politics' : 'Politics',
'science' : 'Science',
'sports' : 'Sports',
'technology' : 'Technology',
'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
}
'''
excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
'New York','Op-Ed','Politics','Science','Sports','Technology',
'Top Stories','Travel','U.S.','World']
'''
excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
'New York','Politics','Science','Sports','Technology',
'Top Stories','Travel','U.S.','World']
#excludeSectionKeywords = []
feed = key = 'All Top Stories'
articles[key] = []
ans.append(key)
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
@@ -152,35 +136,25 @@ class NYTimes(BasicNewsRecipe):
while True :
table = table.find('table')
if table.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
previousTable = table
continue
else :
if self.verbose > 2 : self.log( "found table with top stories")
table = previousTable
if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
break
# There are multiple subtables, find the one containing the stories
for block in table.findAll('table') :
if block.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subtable with top stories")
table = block
if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
break
else :
if self.verbose > 2 : self.log( "trying next subtable")
continue
# Again there are multiple subtables, find the one containing the stories
for storyblock in table.findAll('table') :
if storyblock.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
# table = storyblock
if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
break
else :
if self.verbose > 2 : self.log( "trying next subsubtable")
continue
skipThisSection = False
@@ -192,7 +166,6 @@ class NYTimes(BasicNewsRecipe):
sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
'times new roman,times, sans serif',
'times new roman, times, sans serif']})
if self.verbose > 2 : self.log( "----------- new tr ----------------")
section = None
bylines = []
descriptions = []
@@ -205,26 +178,20 @@ class NYTimes(BasicNewsRecipe):
if ('Comment' in str(i.__class__)) :
if 'start(name=' in i :
section = i[i.find('=')+1:-2]
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
if not sections.has_key(section) :
self.log( "Unrecognized section id: %s, skipping" % section )
if not self.sections.has_key(section) :
skipThisSection = True
break
# Check for excluded section
- if len(excludeSectionKeywords):
- key = sections[section]
- excluded = re.compile('|'.join(excludeSectionKeywords))
+ if len(self.excludeSectionKeywords):
+ key = self.sections[section]
+ excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key) or articles.has_key(key):
if self.verbose > 2 : self.log("Skipping section %s" % key)
if self.verbose : self.log("Skipping section %s" % key)
skipThisSection = True
break
if not self.flatPeriodical :
articles[key] = []
self.ans.append(key)
# Get the bylines and descriptions
if not skipThisSection :
for (x,i) in enumerate(sectionblock.contents) :
@@ -248,31 +215,26 @@ class NYTimes(BasicNewsRecipe):
#continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True)
- if self.flatPeriodical :
- # prepend the section name
- title = sections[section] + " &middot; " + title
+ # prepend the section name
+ title = self.sections[section] + " &middot; " + title
if not isinstance(title, unicode):
title = title.decode('utf-8', 'replace')
description = descriptions[i]
if len(bylines) == articleCount :
author = bylines[i]
else :
author = None
if self.verbose > 2 : self.log( " title: %s" % title)
if self.verbose > 2 : self.log( " url: %s" % url)
if self.verbose > 2 : self.log( " author: %s" % author)
if self.verbose > 2 : self.log( "description: %s" % description)
if not self.flatPeriodical :
self.feed = key
# Check for duplicates
duplicateFound = False
- if self.flatPeriodical and len(articles[self.feed]) > 1:
- #print articles[self.feed]
- for article in articles[self.feed] :
+ if len(articles[feed]) > 1:
+ #print articles[feed]
+ for article in articles[feed] :
#print "comparing %s\n %s\n" % (url, article['url'])
if url == article['url'] :
duplicateFound = True
@@ -280,23 +242,18 @@ class NYTimes(BasicNewsRecipe):
#print
if duplicateFound:
# Continue fetching, don't add this article
print " skipping duplicate %s" % article['url']
continue
- if not articles.has_key(self.feed):
- if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
- articles[self.feed] = []
- if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
- articles[self.feed].append(
+ if not articles.has_key(feed):
+ articles[feed] = []
+ articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author, content=''))
- self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
- self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
- #sys.exit(1)
- return self.ans
+ ans = self.sort_index_by(ans, {'Top Stories':-1})
+ ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return ans
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -307,12 +264,9 @@ class NYTimes(BasicNewsRecipe):
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self,soup, True):
if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker is not None :
print "changing kicker to <h3>"
print kicker
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, kicker.contents[0])
kicker.replaceWith(h3Tag)
@@ -345,13 +299,7 @@ class NYTimes(BasicNewsRecipe):
tag = Tag(soup, "h3")
tag.insert(0, masthead.contents[0])
soup.h1.replaceWith(tag)
'''
# Change subheads to <h3>
for subhead in soup.findAll(True, {'class':'bold'}) :
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, subhead.contents[0])
subhead.replaceWith(h3Tag)
'''
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
bTag = Tag(soup, "b")
@@ -359,4 +307,3 @@ class NYTimes(BasicNewsRecipe):
subhead.replaceWith(bTag)
return soup