Misc. fixes

Kovid Goyal committed on 2009-07-15 12:36:50 -06:00
parent eb625d37c3
commit 656c55debf
5 changed files with 84 additions and 137 deletions

View File

@@ -24,7 +24,7 @@ class ANDROID(USBMS):
                   ]
     PRODUCT_ID = [0x0c02]
     BCD        = [0x100]
-    EBOOK_DIR_MAIN = 'wordplayer/calibre'
+    EBOOK_DIR_MAIN = 'wordplayer/calibretransfer'
     VENDOR_NAME      = 'HTC'
     WINDOWS_MAIN_MEM = 'ANDROID_PHONE'
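Note: the hunk above only redirects where books are uploaded on the device. For context, a USBMS-based device driver in calibre is configured entirely through class attributes like these. A minimal sketch of the pattern; the import path matches calibre's USBMS drivers, but the VENDOR_ID value is illustrative, not taken from this diff:

    from calibre.devices.usbms.driver import USBMS

    class EXAMPLE_PHONE(USBMS):
        # USB identifiers used to detect the device when plugged in.
        # 0x0bb4 is illustrative; a real driver uses the device's actual IDs.
        VENDOR_ID        = [0x0bb4]
        PRODUCT_ID       = [0x0c02]
        BCD              = [0x100]
        # Directory on the device's storage that books are uploaded into
        EBOOK_DIR_MAIN   = 'wordplayer/calibretransfer'
        # Strings matched against the device's USB descriptors on Windows
        VENDOR_NAME      = 'HTC'
        WINDOWS_MAIN_MEM = 'ANDROID_PHONE'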

View File

@@ -163,7 +163,7 @@ def fetch_scheduled_recipe(recipe, script):
                     OptionRecommendation.HIGH))
     lf = load_defaults('look_and_feel')
     if lf.get('base_font_size', 0.0) != 0.0:
-        recs.append(('base_font_size', ps['base_font_size'],
+        recs.append(('base_font_size', lf['base_font_size'],
                     OptionRecommendation.HIGH))
     args = [script, pt.name, recs]
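This is a stale-variable bug fix: the look-and-feel defaults are loaded into lf, but the old code read base_font_size out of an unrelated ps dict, apparently left over from a neighbouring block (its definition is not shown in this hunk), so scheduled fetches could pick up the wrong font size. A reduced sketch of the corrected pattern, with names as in the hunk:

    lf = load_defaults('look_and_feel')
    if lf.get('base_font_size', 0.0) != 0.0:
        # Read from lf, the dict that was actually loaded; 0.0 is the
        # sentinel meaning "the user never customized the font size".
        recs.append(('base_font_size', lf['base_font_size'],
                     OptionRecommendation.HIGH))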

View File

@@ -1015,7 +1015,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
     def books_in_series_of(self, index, index_is_id=False):
         '''
-        Return an ordered list of all books in the series that the book indetified by index belongs to.
+        Return an ordered list of all books in the series that the book identified by index belongs to.
         If the book does not belong to a series return an empty list. The list contains book ids.
         '''
         series_id = self.series_id(index, index_is_id=index_is_id)
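Only the docstring changes here, but it states the method's contract. A hedged usage sketch, assuming db is an open library database instance and book_id is a known database id:

    # With index_is_id=True the index argument is treated as a book id
    # rather than a row number in the current GUI view.
    ids = db.books_in_series_of(book_id, index_is_id=True)
    if not ids:
        print 'book is not part of a series'
    for series_book_id in ids:   # book ids, in series order
        print series_book_id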

View File

@@ -16,7 +16,43 @@ class NYTimes(BasicNewsRecipe):
     __author__  = 'GRiker'
     language = _('English')
     description = 'Top Stories from the New York Times'
-    #max_articles_per_feed = 3
+
+    # List of sections typically included in Top Stories. Use a keyword from the
+    # right column in the excludeSectionKeywords[] list to skip downloading that section
+    sections = {
+         'arts'             :   'Arts',
+         'business'         :   'Business',
+         'diningwine'       :   'Dining & Wine',
+         'editorials'       :   'Editorials',
+         'health'           :   'Health',
+         'magazine'         :   'Magazine',
+         'mediaadvertising' :   'Media & Advertising',
+         'newyorkregion'    :   'New York/Region',
+         'oped'             :   'Op-Ed',
+         'politics'         :   'Politics',
+         'science'          :   'Science',
+         'sports'           :   'Sports',
+         'technology'       :   'Technology',
+         'topstories'       :   'Top Stories',
+         'travel'           :   'Travel',
+         'us'               :   'U.S.',
+         'world'            :   'World'
+    }
+
+    # By default, no sections are skipped.
+    excludeSectionKeywords = []
+
+    # Add section keywords from the right column above to skip that section
+    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
+    # excludeSectionKeywords = ['Sports', 'Dining']
+
+    # Fetch only Business and Technology
+    #excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
+
+    # Fetch only Top Stories
+    #excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+
+    # The maximum number of articles that will be downloaded
+    max_articles_per_feed = 50
+
     timefmt = ''
     needs_subscription = True
     remove_tags_after  = dict(attrs={'id':['comments']})
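The new sections map and excludeSectionKeywords list drive the filtering in parse_index further down in this diff: the keywords are OR-ed into one regular expression and matched against each section's display name (the right-hand column above). A stripped-down sketch of that matching, runnable on its own:

    import re

    sections = {'sports' : 'Sports', 'diningwine' : 'Dining & Wine'}
    excludeSectionKeywords = ['Sports', 'Dining']

    def is_excluded(section_id):
        # An empty keyword list (the new default) skips nothing.
        if not excludeSectionKeywords:
            return False
        excluded = re.compile('|'.join(excludeSectionKeywords))
        return excluded.search(sections[section_id]) is not None

    print is_excluded('sports')       # True: 'Sports' matches
    print is_excluded('diningwine')   # True: 'Dining' matches as a substring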
@@ -31,17 +67,11 @@ class NYTimes(BasicNewsRecipe):
                        dict(name=['script', 'noscript', 'style','hr'])]
     encoding = 'cp1252'
     no_stylesheets = True
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
     extra_css = '.headline {text-align:left;}\n\
                  .byline {font:monospace; margin-bottom:0px;}\n\
                  .source {align:left;}\n\
                  .credit {align:right;}\n'
-
-    flatPeriodical = True
-    feed = None
-    ans = []

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
@@ -54,14 +84,8 @@ class NYTimes(BasicNewsRecipe):
     def index_to_soup(self, url_or_raw, raw=False):
         '''
-        Convenience method that takes an URL to the index page and returns
-        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
-        of it.
-
-        This is an OVERRIDE of the method provided in news.py to solve an encoding problem
-        with NYTimes index pages which seem to be encoded in a wonderful blend
-
-        `url_or_raw`: Either a URL or the downloaded index page as a string
+        OVERRIDE of class method
+        deals with various page encodings between index and articles
         '''
         def get_the_soup(docEncoding, url_or_raw, raw=False) :
             if re.match(r'\w+://', url_or_raw):
@@ -88,8 +112,6 @@ class NYTimes(BasicNewsRecipe):
             if docEncoding == '' :
                 docEncoding = self.encoding
-            if self.verbose :
-                self.log( " document encoding: '%s'" % docEncoding)

             if docEncoding != self.encoding :
                 soup = get_the_soup(docEncoding, url_or_raw)
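The override exists because the Top Stories index page and the article pages do not reliably share one encoding: the page is parsed once with the recipe's default cp1252, the document's own charset declaration is inspected, and only if it disagrees is the page re-parsed. A simplified, self-contained sketch of that two-pass idea; the meta-tag probing here illustrates the approach and is not the exact code elided from this hunk:

    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def soup_with_declared_encoding(raw, default='cp1252'):
        # First pass: assume the default encoding.
        soup = BeautifulSoup(raw.decode(default, 'replace'))
        # Probe <meta http-equiv="Content-Type" content="...charset=...">
        declared = ''
        meta = soup.find('meta', {'http-equiv': re.compile('Content-Type', re.I)})
        if meta is not None and 'charset=' in meta.get('content', ''):
            declared = meta['content'].split('charset=')[-1].strip()
        # Second pass only when the document disagrees with the assumption.
        if declared and declared.lower() != default:
            soup = BeautifulSoup(raw.decode(declared, 'replace'))
        return soup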
@@ -97,49 +119,11 @@ class NYTimes(BasicNewsRecipe):
     def parse_index(self):
         articles = {}
+        ans = []

-        if self.flatPeriodical :
-            self.feed = key = 'All Top Stories'
-            articles[key] = []
-            self.ans.append(key)
-        else :
-            key = None
+        feed = key = 'All Top Stories'
+        articles[key] = []
+        ans.append(key)

-        '''
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-        '''
-
-        sections = {
-             'arts'             :   'Arts',
-             'business'         :   'Business',
-             'editorials'       :   'Editorials',
-             'health'           :   'Health',
-             'magazine'         :   'Magazine',
-             'mediaadvertising' :   'Media & Advertising',
-             'newyorkregion'    :   'New York/Region',
-             'oped'             :   'Op-Ed',
-             'politics'         :   'Politics',
-             'science'          :   'Science',
-             'sports'           :   'Sports',
-             'technology'       :   'Technology',
-             'topstories'       :   'Top Stories',
-             'travel'           :   'Travel',
-             'us'               :   'U.S.',
-             'world'            :   'World'
-        }
-
-        '''
-        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
-                                  'New York','Op-Ed','Politics','Science','Sports','Technology',
-                                  'Top Stories','Travel','U.S.','World']
-        '''
-        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
-                                  'New York','Politics','Science','Sports','Technology',
-                                  'Top Stories','Travel','U.S.','World']
-        #excludeSectionKeywords = []

         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
@@ -152,35 +136,25 @@ class NYTimes(BasicNewsRecipe):
         while True :
             table = table.find('table')
             if table.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
                 previousTable = table
                 continue
             else :
-                if self.verbose > 2 : self.log( "found table with top stories")
                 table = previousTable
-                if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
                 break

         # There are multiple subtables, find the one containing the stories
         for block in table.findAll('table') :
             if block.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "found subtable with top stories")
                 table = block
-                if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
                 break
             else :
-                if self.verbose > 2 : self.log( "trying next subtable")
                 continue

         # Again there are multiple subtables, find the one containing the stories
         for storyblock in table.findAll('table') :
             if storyblock.find(text=re.compile('top stories start')) :
-                if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
-                # table = storyblock
-                if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
                 break
             else :
-                if self.verbose > 2 : self.log( "trying next subsubtable")
                 continue

         skipThisSection = False
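The nested-table walk above survives the cleanup because it is the load-bearing part: the index page marks its story area with a 'top stories start' HTML comment buried several layout tables deep, and the code descends one table at a time, remembering the last table that still contained the marker. A condensed sketch of that descend-and-backtrack pattern, assuming a BeautifulSoup 3 soup:

    import re

    def innermost_table_with(soup, marker='top stories start'):
        # Descend through nested <table> tags while the marker comment is
        # still visible; when a deeper table loses it, the previously
        # remembered table is the innermost one containing the stories.
        pattern = re.compile(marker)
        table = soup.find('table')
        previous = table
        while table is not None and table.find(text=pattern):
            previous = table
            table = table.find('table')
        return previous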
@@ -192,7 +166,6 @@ class NYTimes(BasicNewsRecipe):
             sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
                                                         'times new roman,times, sans serif',
                                                         'times new roman, times, sans serif']})
-            if self.verbose > 2 : self.log( "----------- new tr ----------------")
             section = None
             bylines = []
             descriptions = []
@@ -205,26 +178,20 @@ class NYTimes(BasicNewsRecipe):
                 if ('Comment' in str(i.__class__)) :
                     if 'start(name=' in i :
                         section = i[i.find('=')+1:-2]
-                        if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])

-                        if not sections.has_key(section) :
-                            self.log( "Unrecognized section id: %s, skipping" % section )
+                        if not self.sections.has_key(section) :
                             skipThisSection = True
                             break

                         # Check for excluded section
-                        if len(excludeSectionKeywords):
-                            key = sections[section]
-                            excluded = re.compile('|'.join(excludeSectionKeywords))
+                        if len(self.excludeSectionKeywords):
+                            key = self.sections[section]
+                            excluded = re.compile('|'.join(self.excludeSectionKeywords))
                             if excluded.search(key) or articles.has_key(key):
-                                if self.verbose > 2 : self.log("Skipping section %s" % key)
+                                if self.verbose : self.log("Skipping section %s" % key)
                                 skipThisSection = True
                                 break
-                            if not self.flatPeriodical :
-                                articles[key] = []
-                                self.ans.append(key)

             # Get the bylines and descriptions
             if not skipThisSection :
                 for (x,i) in enumerate(sectionblock.contents) :
@@ -248,31 +215,26 @@ class NYTimes(BasicNewsRecipe):
                         #continue

                     url = re.sub(r'\?.*', '', a['href'])
                     url += '?pagewanted=all'

                     title = self.tag_to_string(a, use_alt=True)
-                    if self.flatPeriodical :
-                        # prepend the section name
-                        title = sections[section] + " &middot; " + title
+                    # prepend the section name
+                    title = self.sections[section] + " &middot; " + title

                     if not isinstance(title, unicode):
                         title = title.decode('utf-8', 'replace')

                     description = descriptions[i]
                     if len(bylines) == articleCount :
                         author = bylines[i]
                     else :
                         author = None

-                    if self.verbose > 2 : self.log( " title: %s" % title)
-                    if self.verbose > 2 : self.log( " url: %s" % url)
-                    if self.verbose > 2 : self.log( " author: %s" % author)
-                    if self.verbose > 2 : self.log( "description: %s" % description)
-
-                    if not self.flatPeriodical :
-                        self.feed = key

                     # Check for duplicates
                     duplicateFound = False
-                    if self.flatPeriodical and len(articles[self.feed]) > 1:
-                        #print articles[self.feed]
-                        for article in articles[self.feed] :
+                    if len(articles[feed]) > 1:
+                        #print articles[feed]
+                        for article in articles[feed] :
                             #print "comparing %s\n %s\n" % (url, article['url'])
                             if url == article['url'] :
                                 duplicateFound = True
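Because Top Stories aggregates several sections, the same article can appear under more than one heading; with everything now flattened into the single 'All Top Stories' feed, a linear scan of the URLs collected so far is enough to drop repeats. A minimal sketch of the check; it works because the recipe first normalizes each URL by stripping the query string and appending ?pagewanted=all, making plain equality reliable:

    def is_duplicate(url, collected):
        # collected: the list of article dicts gathered for the feed so far
        for article in collected:
            if url == article['url']:
                return True
        return False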
@@ -280,23 +242,18 @@ class NYTimes(BasicNewsRecipe):
                                 #print

                     if duplicateFound:
-                        # Continue fetching, don't add this article
-                        print " skipping duplicate %s" % article['url']
                         continue

-                    if not articles.has_key(self.feed):
-                        if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
-                        articles[self.feed] = []
-                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
-                    articles[self.feed].append(
+                    if not articles.has_key(feed):
+                        articles[feed] = []
+                    articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
                              description=description, author=author, content=''))

-        self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
-        self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
-        #sys.exit(1)
-        return self.ans
+        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans

     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
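For reference, parse_index must hand back a list of (feed_title, article_list) tuples, where each article is a dict with title/url/date/description/author/content keys; sort_index_by is the BasicNewsRecipe helper used above, apparently with a -1 weight to pin 'Top Stories' ahead of any other key. A sketch of the final shape, with illustrative data:

    articles = {'All Top Stories': [
        dict(title='World &middot; Example headline',
             url='http://example.com/story?pagewanted=all',
             date='', description='...', author=None, content=''),
    ]}
    ans = ['All Top Stories']
    # What the download machinery receives from parse_index:
    index = [(key, articles[key]) for key in ans if articles.has_key(key)]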
@@ -307,12 +264,9 @@ class NYTimes(BasicNewsRecipe):
             return BeautifulSoup(raw.decode('cp1252', 'replace'))

     def postprocess_html(self,soup, True):
-        if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")

         # Change class="kicker" to <h3>
         kicker = soup.find(True, {'class':'kicker'})
         if kicker is not None :
-            print "changing kicker to <h3>"
-            print kicker
             h3Tag = Tag(soup, "h3")
             h3Tag.insert(0, kicker.contents[0])
             kicker.replaceWith(h3Tag)
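The kicker hunk shows the recipe's recurring element-rewriting idiom under the bundled BeautifulSoup 3: create a fresh Tag bound to the soup, move the old element's leading content into it, then swap it in with replaceWith. A generic sketch of the idiom (the helper name is mine):

    from calibre.ebooks.BeautifulSoup import Tag

    def retag_first(soup, old_class, new_name):
        # Replace the first element with class=old_class by a <new_name>
        # element carrying the same leading content.
        elem = soup.find(True, {'class': old_class})
        if elem is not None and elem.contents:
            newTag = Tag(soup, new_name)
            newTag.insert(0, elem.contents[0])
            elem.replaceWith(newTag)
        return soup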
@@ -345,13 +299,7 @@ class NYTimes(BasicNewsRecipe):
             tag = Tag(soup, "h3")
             tag.insert(0, masthead.contents[0])
             soup.h1.replaceWith(tag)
-        '''
-        # Change subheads to <h3>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, subhead.contents[0])
-            subhead.replaceWith(h3Tag)
-        '''
         # Change <span class="bold"> to <b>
         for subhead in soup.findAll(True, {'class':'bold'}) :
             bTag = Tag(soup, "b")
@@ -359,4 +307,3 @@ class NYTimes(BasicNewsRecipe):
             subhead.replaceWith(bTag)
-
         return soup