Misc. fixes

Kovid Goyal 2009-07-15 12:36:50 -06:00
parent eb625d37c3
commit 656c55debf
5 changed files with 84 additions and 137 deletions

View File

@@ -24,7 +24,7 @@ class ANDROID(USBMS):
]
PRODUCT_ID = [0x0c02]
BCD = [0x100]
- EBOOK_DIR_MAIN = 'wordplayer/calibre'
+ EBOOK_DIR_MAIN = 'wordplayer/calibretransfer'
VENDOR_NAME = 'HTC'
WINDOWS_MAIN_MEM = 'ANDROID_PHONE'

View File

@@ -3,12 +3,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
@@ -30,9 +30,9 @@ def detect(aBuf):
# Added by Kovid
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>',
re.IGNORECASE)
]
ENTITY_PATTERN = re.compile(r'&(\S+?);')
@@ -51,7 +51,7 @@ def substitute_entites(raw):
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def force_encoding(raw, verbose):
from calibre.constants import preferred_encoding
@@ -70,19 +70,19 @@ def force_encoding(raw, verbose):
if encoding == 'ascii':
encoding = 'utf-8'
return encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
prints a warning if detection confidence is < 100%
@return: (unicode, encoding used)
'''
encoding = None
if not raw:
return u'', encoding
if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
@@ -103,10 +103,10 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
except LookupError:
encoding = 'utf-8'
raw = raw.decode(encoding, 'replace')
if strip_encoding_pats:
raw = strip_encoding_declarations(raw)
if resolve_entities:
raw = substitute_entites(raw)
return raw, encoding
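
For orientation, the docstring above is the whole contract of xml_to_unicode; a minimal usage sketch (the import path and the sample file name are assumptions for illustration, not part of this commit):

    from calibre.ebooks.chardet import xml_to_unicode

    raw = open('page.html', 'rb').read()        # byte string, encoding unknown
    text, used_encoding = xml_to_unicode(raw,
            verbose=True,                       # warn when chardet confidence is below 100%
            strip_encoding_pats=True,           # drop XML/HTML encoding declarations from the result
            resolve_entities=True)              # substitute &entity; references
    print used_encoding                         # e.g. 'utf-8'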

View File

@@ -163,7 +163,7 @@ def fetch_scheduled_recipe(recipe, script):
OptionRecommendation.HIGH))
lf = load_defaults('look_and_feel')
if lf.get('base_font_size', 0.0) != 0.0:
- recs.append(('base_font_size', ps['base_font_size'],
+ recs.append(('base_font_size', lf['base_font_size'],
OptionRecommendation.HIGH))
args = [script, pt.name, recs]

View File

@@ -1015,7 +1015,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
def books_in_series_of(self, index, index_is_id=False):
'''
- Return an ordered list of all books in the series that the book indetified by index belongs to.
+ Return an ordered list of all books in the series that the book identified by index belongs to.
If the book does not belong to a series return an empty list. The list contains book ids.
'''
series_id = self.series_id(index, index_is_id=index_is_id)
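
The docstring above fully specifies the return value; a short caller sketch (the import path and library location are assumptions for illustration):

    from calibre.library.database2 import LibraryDatabase2   # assumed import path
    db = LibraryDatabase2('/path/to/library')                 # hypothetical library folder

    # Ordered ids of all books in the same series as the book with database id 42;
    # an empty list if that book does not belong to a series.
    for book_id in db.books_in_series_of(42, index_is_id=True):
        print book_id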

View File

@@ -16,32 +16,62 @@ class NYTimes(BasicNewsRecipe):
__author__ = 'GRiker'
language = _('English')
description = 'Top Stories from the New York Times'
#max_articles_per_feed = 3
# List of sections typically included in Top Stories. Use a keyword from the
# right column in the excludeSectionKeywords[] list to skip downloading that section
sections = {
'arts' : 'Arts',
'business' : 'Business',
'diningwine' : 'Dining & Wine',
'editorials' : 'Editorials',
'health' : 'Health',
'magazine' : 'Magazine',
'mediaadvertising' : 'Media & Advertising',
'newyorkregion' : 'New York/Region',
'oped' : 'Op-Ed',
'politics' : 'Politics',
'science' : 'Science',
'sports' : 'Sports',
'technology' : 'Technology',
'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
}
# By default, no sections are skipped.
excludeSectionKeywords = []
# Add section keywords from the right column above to skip that section
# For example, to skip sections containing the word 'Sports' or 'Dining', use:
# excludeSectionKeywords = ['Sports', 'Dining']
# Fetch only Business and Technology
#excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
# Fetch only Top Stories
#excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
# The maximum number of articles that will be downloaded
max_articles_per_feed = 50
timefmt = ''
needs_subscription = True
remove_tags_after = dict(attrs={'id':['comments']})
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
'columnGroup','entry-meta','entry-response module','jumpLink','nav',
'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
'adxLeaderboard']),
dict(name=['script', 'noscript', 'style','hr'])]
encoding = 'cp1252'
no_stylesheets = True
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '.headline {text-align:left;}\n\
.byline {font:monospace; margin-bottom:0px;}\n\
.source {align:left;}\n\
.credit {align:right;}\n'
flatPeriodical = True
feed = None
ans = []
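
The excludeSectionKeywords comments above describe keyword-based section filtering; a self-contained sketch of the matching that parse_index performs further down (all names here are local to the example):

    import re

    sections = {'sports': 'Sports', 'diningwine': 'Dining & Wine', 'world': 'World'}
    excludeSectionKeywords = ['Sports', 'Dining']

    if len(excludeSectionKeywords):                    # guard: '|'.join([]) would match every title
        excluded = re.compile('|'.join(excludeSectionKeywords))
        for key in sections.values():
            if excluded.search(key):
                print "skipping section %s" % key      # 'Sports' and 'Dining & Wine'
            else:
                print "keeping section %s" % key       # 'World'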
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
@@ -54,14 +84,8 @@ class NYTimes(BasicNewsRecipe):
def index_to_soup(self, url_or_raw, raw=False):
'''
- Convenience method that takes an URL to the index page and returns
- a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
- of it.
- This is an OVERRIDE of the method provided in news.py to solve an encoding problem
- with NYTimes index pages which seem to be encoded in a wonderful blend
- `url_or_raw`: Either a URL or the downloaded index page as a string
+ OVERRIDE of class method
+ deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
@@ -88,58 +112,18 @@ class NYTimes(BasicNewsRecipe):
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose :
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
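
The override's docstring is terse; the logic visible above is a two-pass parse: read the page with the recipe's default cp1252 encoding, look at the encoding the document itself declares, and re-parse only if they differ. A standalone sketch of that idea (detect_declared_encoding is a hypothetical helper, not part of the recipe):

    import re

    def detect_declared_encoding(raw, default='cp1252'):
        # look for a charset declaration in the raw, undecoded page
        m = re.search(r'charset=([^\s\'";>]+)', raw, re.IGNORECASE)
        return m.group(1) if m else default

    raw = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    declared = detect_declared_encoding(raw)
    if declared != 'cp1252':
        print "re-parse with the declared encoding: %s" % declared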
def parse_index(self):
articles = {}
ans = []
if self.flatPeriodical :
self.feed = key = 'All Top Stories'
articles[key] = []
self.ans.append(key)
else :
key = None
'''
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
'''
sections = {
'arts' : 'Arts',
'business' : 'Business',
'editorials' : 'Editorials',
'health' : 'Health',
'magazine' : 'Magazine',
'mediaadvertising' : 'Media & Advertising',
'newyorkregion' : 'New York/Region',
'oped' : 'Op-Ed',
'politics' : 'Politics',
'science' : 'Science',
'sports' : 'Sports',
'technology' : 'Technology',
'topstories' : 'Top Stories',
'travel' : 'Travel',
'us' : 'U.S.',
'world' : 'World'
}
'''
excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
'New York','Op-Ed','Politics','Science','Sports','Technology',
'Top Stories','Travel','U.S.','World']
'''
excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
'New York','Politics','Science','Sports','Technology',
'Top Stories','Travel','U.S.','World']
#excludeSectionKeywords = []
feed = key = 'All Top Stories'
articles[key] = []
ans.append(key)
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
@@ -152,35 +136,25 @@ class NYTimes(BasicNewsRecipe):
while True :
table = table.find('table')
if table.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
previousTable = table
continue
else :
if self.verbose > 2 : self.log( "found table with top stories")
table = previousTable
if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
break
# There are multiple subtables, find the one containing the stories
for block in table.findAll('table') :
if block.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subtable with top stories")
table = block
if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
break
else :
if self.verbose > 2 : self.log( "trying next subtable")
continue
# Again there are multiple subtables, find the one containing the stories
for storyblock in table.findAll('table') :
if storyblock.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
# table = storyblock
if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
break
else :
if self.verbose > 2 : self.log( "trying next subsubtable")
continue
skipThisSection = False
@@ -192,7 +166,6 @@ class NYTimes(BasicNewsRecipe):
sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
'times new roman,times, sans serif',
'times new roman, times, sans serif']})
if self.verbose > 2 : self.log( "----------- new tr ----------------")
section = None
bylines = []
descriptions = []
@@ -205,26 +178,20 @@ class NYTimes(BasicNewsRecipe):
if ('Comment' in str(i.__class__)) :
if 'start(name=' in i :
section = i[i.find('=')+1:-2]
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
if not sections.has_key(section) :
self.log( "Unrecognized section id: %s, skipping" % section )
if not self.sections.has_key(section) :
skipThisSection = True
break
# Check for excluded section
- if len(excludeSectionKeywords):
- key = sections[section]
- excluded = re.compile('|'.join(excludeSectionKeywords))
+ if len(self.excludeSectionKeywords):
+ key = self.sections[section]
+ excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key) or articles.has_key(key):
if self.verbose > 2 : self.log("Skipping section %s" % key)
if self.verbose : self.log("Skipping section %s" % key)
skipThisSection = True
break
if not self.flatPeriodical :
articles[key] = []
self.ans.append(key)
# Get the bylines and descriptions
if not skipThisSection :
for (x,i) in enumerate(sectionblock.contents) :
@@ -248,31 +215,26 @@ class NYTimes(BasicNewsRecipe):
#continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True)
- if self.flatPeriodical :
- # prepend the section name
- title = sections[section] + " &middot; " + title
+ # prepend the section name
+ title = self.sections[section] + " &middot; " + title
if not isinstance(title, unicode):
title = title.decode('utf-8', 'replace')
description = descriptions[i]
if len(bylines) == articleCount :
author = bylines[i]
else :
author = None
if self.verbose > 2 : self.log( " title: %s" % title)
if self.verbose > 2 : self.log( " url: %s" % url)
if self.verbose > 2 : self.log( " author: %s" % author)
if self.verbose > 2 : self.log( "description: %s" % description)
if not self.flatPeriodical :
self.feed = key
# Check for duplicates
duplicateFound = False
- if self.flatPeriodical and len(articles[self.feed]) > 1:
- #print articles[self.feed]
- for article in articles[self.feed] :
+ if len(articles[feed]) > 1:
+ #print articles[feed]
+ for article in articles[feed] :
#print "comparing %s\n %s\n" % (url, article['url'])
if url == article['url'] :
duplicateFound = True
@@ -280,23 +242,18 @@ class NYTimes(BasicNewsRecipe):
#print
if duplicateFound:
# Continue fetching, don't add this article
print " skipping duplicate %s" % article['url']
continue
- if not articles.has_key(self.feed):
- if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
- articles[self.feed] = []
- if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
- articles[self.feed].append(
+ if not articles.has_key(feed):
+ articles[feed] = []
+ articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author, content=''))
- self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
- self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
- #sys.exit(1)
- return self.ans
+ ans = self.sort_index_by(ans, {'Top Stories':-1})
+ ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return ans
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -307,12 +264,9 @@ class NYTimes(BasicNewsRecipe):
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self,soup, True):
if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
# Change class="kicker" to <h3>
kicker = soup.find(True, {'class':'kicker'})
if kicker is not None :
print "changing kicker to <h3>"
print kicker
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, kicker.contents[0])
kicker.replaceWith(h3Tag)
@@ -345,13 +299,7 @@ class NYTimes(BasicNewsRecipe):
tag = Tag(soup, "h3")
tag.insert(0, masthead.contents[0])
soup.h1.replaceWith(tag)
'''
# Change subheads to <h3>
for subhead in soup.findAll(True, {'class':'bold'}) :
h3Tag = Tag(soup, "h3")
h3Tag.insert(0, subhead.contents[0])
subhead.replaceWith(h3Tag)
'''
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
bTag = Tag(soup, "b")
@@ -359,4 +307,3 @@ class NYTimes(BasicNewsRecipe):
subhead.replaceWith(bTag)
return soup