diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index dafd67a4ad..98142b6b02 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -24,7 +24,7 @@ class ANDROID(USBMS):
             ]
     PRODUCT_ID = [0x0c02]
     BCD        = [0x100]
-    EBOOK_DIR_MAIN = 'wordplayer/calibre'
+    EBOOK_DIR_MAIN = 'wordplayer/calibretransfer'
 
     VENDOR_NAME = 'HTC'
     WINDOWS_MAIN_MEM = 'ANDROID_PHONE'
diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py
index 2e8cdcb67c..975ffc1331 100644
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -3,12 +3,12 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-# 
+#
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 # Lesser General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@@ -30,9 +30,9 @@ def detect(aBuf):
 
 # Added by Kovid
 ENCODING_PATS = [
-                 re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', 
+                 re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
                             re.IGNORECASE),
-                 re.compile(r'<meta.+?content=[\'"].+?charset=(.+?)[\'"].*?>', 
+                 re.compile(r'<meta.+?content=[\'"].+?charset=(.+?)[\'"].*?>',
                             re.IGNORECASE)
                  ]
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
@@ -51,7 +51,7 @@ def substitute_entites(raw):
 
 _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                      "x-sjis" : "shift-jis" }
-        
+
 def force_encoding(raw, verbose):
     from calibre.constants import preferred_encoding
 
@@ -70,19 +70,19 @@ def force_encoding(raw, verbose):
     if encoding == 'ascii':
         encoding = 'utf-8'
     return encoding
-    
-def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, 
+
+def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                    resolve_entities=False):
     '''
-    Force conversion of byte string to unicode. Tries to look for XML/HTML 
+    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
    prints a warning if detection confidence is < 100%
-    @return: (unicode, encoding used) 
+    @return: (unicode, encoding used)
    '''
    encoding = None
    if not raw:
-        return u'', encoding 
+        return u'', encoding
    if not isinstance(raw, unicode):
        if raw.startswith('\xff\xfe'):
            raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
@@ -103,10 +103,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
    except LookupError:
        encoding = 'utf-8'
    raw = raw.decode(encoding, 'replace')
-    
+
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
-        
-    return raw, encoding 
+
+    return raw, encoding
diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py
index 6b395dac79..d146f2ab5e 100644
--- a/src/calibre/gui2/tools.py
+++ b/src/calibre/gui2/tools.py
@@ -163,7 +163,7 @@ def fetch_scheduled_recipe(recipe, script):
                     OptionRecommendation.HIGH))
     lf = load_defaults('look_and_feel')
     if lf.get('base_font_size', 0.0) != 0.0:
-        recs.append(('base_font_size', ps['base_font_size'],
+        recs.append(('base_font_size', lf['base_font_size'],
                     OptionRecommendation.HIGH))
 
     args = [script, pt.name, recs]
diff --git a/src/calibre/library/database.py b/src/calibre/library/database.py
index ed92853df2..d0d8b1aa61 100644
--- a/src/calibre/library/database.py
+++ b/src/calibre/library/database.py
@@ -1015,7 +1015,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
 
     def books_in_series_of(self, index, index_is_id=False):
         '''
-        Return an ordered list of all books in the series that the book indetified by index belongs to.
+        Return an ordered list of all books in the series that the book identified by index belongs to.
         If the book does not belong to a series return an empty list. The list contains book ids.
         '''
         series_id = self.series_id(index, index_is_id=index_is_id)
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index 04313a1172..ce7cf20e4f 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -16,32 +16,62 @@ class NYTimes(BasicNewsRecipe):
 
     __author__ = 'GRiker'
     language = _('English')
     description = 'Top Stories from the New York Times'
-    #max_articles_per_feed = 3
+
+    # List of sections typically included in Top Stories. Use a keyword from the
+    # right column in the excludeSectionKeywords[] list to skip downloading that section.
+    sections = {
+         'arts'             : 'Arts',
+         'business'         : 'Business',
+         'diningwine'       : 'Dining & Wine',
+         'editorials'       : 'Editorials',
+         'health'           : 'Health',
+         'magazine'         : 'Magazine',
+         'mediaadvertising' : 'Media & Advertising',
+         'newyorkregion'    : 'New York/Region',
+         'oped'             : 'Op-Ed',
+         'politics'         : 'Politics',
+         'science'          : 'Science',
+         'sports'           : 'Sports',
+         'technology'       : 'Technology',
+         'topstories'       : 'Top Stories',
+         'travel'           : 'Travel',
+         'us'               : 'U.S.',
+         'world'            : 'World'
+    }
+
+    # By default, no sections are skipped.
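+    # (Keywords are regex-matched against the section titles in sections{} above;
+    # see the excluded-section check in parse_index() below.)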
+    excludeSectionKeywords = []
+
+    # Add section keywords from the right column above to skip that section
+    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
+    # excludeSectionKeywords = ['Sports', 'Dining']
+    # Fetch only Business and Technology
+    #excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
+    # Fetch only Top Stories
+    #excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+
+    # The maximum number of articles that will be downloaded
+    max_articles_per_feed = 50
+
     timefmt = ''
     needs_subscription = True
     remove_tags_after = dict(attrs={'id':['comments']})
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
                    'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
                    'columnGroup','entry-meta','entry-response module','jumpLink','nav',
                    'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
                    'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
                    'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
                    'adxLeaderboard']),
                    dict(name=['script', 'noscript', 'style','hr'])]
     encoding = 'cp1252'
     no_stylesheets = True
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
     extra_css = '.headline {text-align:left;}\n\
                  .byline {font:monospace; margin-bottom:0px;}\n\
                  .source {align:left;}\n\
                  .credit {align:right;}\n'
-
-    flatPeriodical = True
-    feed = None
-    ans = []
-
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
@@ -54,14 +84,8 @@ class NYTimes(BasicNewsRecipe):
 
     def index_to_soup(self, url_or_raw, raw=False):
         '''
-        Convenience method that takes an URL to the index page and returns
-        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
-        of it.
-
-        This is an OVERRIDE of the method provided in news.py to solve an encoding problem
-        with NYTimes index pages which seem to be encoded in a wonderful blend
-
-        `url_or_raw`: Either a URL or the downloaded index page as a string
+        OVERRIDE of the class method in news.py; deals with the varying
+        page encodings used between the index and article pages
         '''
         def get_the_soup(docEncoding, url_or_raw, raw=False) :
             if re.match(r'\w+://', url_or_raw):
@@ -88,58 +112,18 @@ class NYTimes(BasicNewsRecipe):
             if docEncoding == '' :
                 docEncoding = self.encoding
 
-            if self.verbose :
-                self.log( " document encoding: '%s'" % docEncoding)
             if docEncoding != self.encoding :
-                soup = get_the_soup(docEncoding, url_or_raw) 
+                soup = get_the_soup(docEncoding, url_or_raw)
 
             return soup
 
     def parse_index(self):
         articles = {}
+        ans = []
 
-        if self.flatPeriodical :
-            self.feed = key = 'All Top Stories'
-            articles[key] = []
-            self.ans.append(key)
-        else :
-            key = None
-
-        '''
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-        '''
-
-        sections = {
-             'arts'             : 'Arts',
-             'business'         : 'Business',
-             'editorials'       : 'Editorials',
-             'health'           : 'Health',
-             'magazine'         : 'Magazine',
-             'mediaadvertising' : 'Media & Advertising',
-             'newyorkregion'    : 'New York/Region',
-             'oped'             : 'Op-Ed',
-             'politics'         : 'Politics',
-             'science'          : 'Science',
-             'sports'           : 'Sports',
-             'technology'       : 'Technology',
-             'topstories'       : 'Top Stories',
-             'travel'           : 'Travel',
-             'us'               : 'U.S.',
-             'world'            : 'World'
-        }
-
-        '''
-        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
-                                  'New York','Op-Ed','Politics','Science','Sports','Technology',
-                                  'Top Stories','Travel','U.S.','World']
-        '''
-        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
-                                  'New York','Politics','Science','Sports','Technology',
-                                  'Top Stories','Travel','U.S.','World']
-
-        #excludeSectionKeywords = []
+        feed = key = 'All Top Stories'
+        articles[key] = []
+        ans.append(key)
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
@@ -152,35 +136,25 @@ class NYTimes(BasicNewsRecipe):
         while True :
             table = table.find('table')
             if table.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
                 previousTable = table
                 continue
             else :
-                if self.verbose > 2 : self.log( "found table with top stories")
                 table = previousTable
-                if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
                 break
 
         # There are multiple subtables, find the one containing the stories
         for block in table.findAll('table') :
             if block.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "found subtable with top stories")
                 table = block
-                if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
                 break
             else :
-                if self.verbose > 2 : self.log( "trying next subtable")
                 continue
 
         # Again there are multiple subtables, find the one containing the stories
         for storyblock in table.findAll('table') :
             if storyblock.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
-                # table = storyblock
-                if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
                 break
             else :
-                if self.verbose > 2 : self.log( "trying next subsubtable")
                 continue
 
         skipThisSection = False
 
@@ -192,7 +166,6 @@ class NYTimes(BasicNewsRecipe):
                 sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
                                                             'times new roman,times, sans serif',
                                                             'times new roman, times, sans serif']})
-                if self.verbose > 2 : self.log( "----------- new tr ----------------")
                 section = None
                 bylines = []
                 descriptions = []
@@ -205,26 +178,20 @@ class NYTimes(BasicNewsRecipe):
                     if ('Comment' in str(i.__class__)) :
                         if 'start(name=' in i :
                             section = i[i.find('=')+1:-2]
-                            if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
-                            if not sections.has_key(section) :
-                                self.log( "Unrecognized section id: %s, skipping" % section )
+                            if not self.sections.has_key(section) :
                                 skipThisSection = True
                                 break
 
                             # Check for excluded section
-                            if len(excludeSectionKeywords):
-                                key = sections[section]
-                                excluded = re.compile('|'.join(excludeSectionKeywords))
+                            if len(self.excludeSectionKeywords):
+                                key = self.sections[section]
+                                excluded = re.compile('|'.join(self.excludeSectionKeywords))
                                 if excluded.search(key) or articles.has_key(key):
-                                    if self.verbose > 2 : self.log("Skipping section %s" % key)
+                                    if self.verbose : self.log("Skipping section %s" % key)
                                     skipThisSection = True
                                     break
 
-                            if not self.flatPeriodical :
-                                articles[key] = []
-                                self.ans.append(key)
-
             # Get the bylines and descriptions
             if not skipThisSection :
@@ -248,31 +215,26 @@ class NYTimes(BasicNewsRecipe):
                         #continue
                     url = re.sub(r'\?.*', '', a['href'])
                     url += '?pagewanted=all'
+
                     title = self.tag_to_string(a, use_alt=True)
-                    if self.flatPeriodical :
-                        # prepend the section name
-                        title = sections[section] + " · " + title
+                    # prepend the section name
+                    title = self.sections[section] + " · " + title
+
                     if not isinstance(title, unicode):
                         title = title.decode('utf-8', 'replace')
+
                     description = descriptions[i]
+
                     if len(bylines) == articleCount :
                         author = bylines[i]
                     else :
                         author = None
-                    if self.verbose > 2 : self.log( " title: %s" % title)
-                    if self.verbose > 2 : self.log( " url: %s" % url)
-                    if self.verbose > 2 : self.log( " author: %s" % author)
-                    if self.verbose > 2 : self.log( "description: %s" % description)
-
-                    if not self.flatPeriodical :
-                        self.feed = key
 
                     # Check for duplicates
                     duplicateFound = False
-                    if self.flatPeriodical and len(articles[self.feed]) > 1:
-                        #print articles[self.feed]
-                        for article in articles[self.feed] :
+                    if len(articles[feed]) > 1:
+                        #print articles[feed]
+                        for article in articles[feed] :
                             #print "comparing %s\n %s\n" % (url, article['url'])
                             if url == article['url'] :
                                 duplicateFound = True
@@ -280,23 +242,18 @@ class NYTimes(BasicNewsRecipe):
                                 #print
 
                     if duplicateFound:
-                        # Continue fetching, don't add this article
-                        print " skipping duplicate %s" % article['url']
                         continue
 
-                    if not articles.has_key(self.feed):
-                        if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
-                        articles[self.feed] = []
-                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
-                    articles[self.feed].append(
+                    if not articles.has_key(feed):
+                        articles[feed] = []
+                    articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
                              description=description, author=author, content=''))
 
-        self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
-        self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
-        #sys.exit(1)
-
-        return self.ans
+        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+
+        return ans
 
     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -307,12 +264,9 @@ class NYTimes(BasicNewsRecipe):
         return BeautifulSoup(raw.decode('cp1252', 'replace'))
 
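+    # Editor's note (not part of the original patch): postprocess_html() below
+    # runs on each downloaded article page, after preprocess_html(), rewriting
+    # NYT-specific markup into plain HTML for conversion.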
     def postprocess_html(self,soup, True):
-        if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
         # Change class="kicker" to <h3>
         kicker = soup.find(True, {'class':'kicker'})
         if kicker is not None :
-            print "changing kicker to <h3>"
-            print kicker
             h3Tag = Tag(soup, "h3")
             h3Tag.insert(0, kicker.contents[0])
             kicker.replaceWith(h3Tag)
@@ -345,13 +299,7 @@ class NYTimes(BasicNewsRecipe):
 
             tag = Tag(soup, "h3")
             tag.insert(0, masthead.contents[0])
             soup.h1.replaceWith(tag)
-        '''
-        # Change subheads to <h3>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, subhead.contents[0])
-            subhead.replaceWith(h3Tag)
-        '''
+        # Change <span class="bold"> to <b>
         for subhead in soup.findAll(True, {'class':'bold'}) :
             bTag = Tag(soup, "b")
@@ -359,4 +307,3 @@ class NYTimes(BasicNewsRecipe):
             bTag.insert(0, subhead.contents[0])
             subhead.replaceWith(bTag)
         return soup
-
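
Usage sketch (reviewer addition, not part of the patch): with sections{} and
excludeSectionKeywords promoted to class attributes, the section filter can be
tuned without touching parse_index(). A minimal example, assuming the recipe
class is importable; the subclass name and import path are hypothetical:

    # Hypothetical customization of the updated NYTimes recipe.
    from calibre.web.feeds.recipes.recipe_nytimes import NYTimes

    class CustomNYTimes(NYTimes):
        # Keywords are regex-matched against the section titles in
        # NYTimes.sections, e.g. 'Sports', 'Dining & Wine'.
        excludeSectionKeywords = ['Sports', 'Dining']
        max_articles_per_feed = 25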