Sync to trunk.

John Schember 2009-07-12 12:48:13 -04:00
commit 97c1b8a0c3
7 changed files with 341 additions and 108 deletions

View File

@ -764,7 +764,25 @@ class Manifest(object):
# Convert to Unicode and normalize line endings
data = self.oeb.decode(data)
data = self.oeb.html_preprocessor(data)
orig_data = data
# Remove DOCTYPE declaration as it messes up parsing
# In particular, it causes tostring to insert xmlns
# declarations, which messes up the coercing logic
idx = data.find('<html')
if idx > -1:
    pre = data[:idx]
    data = data[idx:]
    if '<!DOCTYPE' in pre:
        user_entities = {}
        for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
            val = match.group(2)
            if val.startswith('"') and val.endswith('"'):
                val = val[1:-1]
            user_entities[match.group(1)] = val
        if user_entities:
            pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
            data = pat.sub(lambda m:user_entities[m.group(1)], data)
# Try with more & more drastic measures to parse
def first_pass(data):
    try:
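The new block harvests any user-declared entities from the DOCTYPE prolog before the prolog is cut off, so references to them still resolve in the remaining markup. A minimal standalone sketch of that substitution (the sample markup and variable names are illustrative, not taken from calibre):

import re

# Illustrative input: one user-defined entity declared in the DOCTYPE prolog
sample = '''<!DOCTYPE html [ <!ENTITY nbsp "&#160;"> ]>
<html><body>Hello&nbsp;world</body></html>'''

idx = sample.find('<html')
pre, body = sample[:idx], sample[idx:]

user_entities = {}
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
    val = match.group(2)
    if val.startswith('"') and val.endswith('"'):
        val = val[1:-1]
    user_entities[match.group(1)] = val

if user_entities:
    pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
    body = pat.sub(lambda m: user_entities[m.group(1)], body)

print(body)  # the &nbsp; reference has been replaced with &#160;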

View File

@ -282,8 +282,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.initialize_combos()
si = self.db.series_index(row)
-self.series_index.setValue(self.db.series_index(row))
if si is None:
    si = 1.0
self.series_index.setValue(si)
QObject.connect(self.series, SIGNAL('currentIndexChanged(int)'), self.enable_series_index)
QObject.connect(self.series, SIGNAL('editTextChanged(QString)'), self.enable_series_index)
@ -305,6 +307,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
def deduce_author_sort(self):
    au = unicode(self.authors.text())
    au = re.sub(r'\s+et al\.$', '', au)
    authors = string_to_authors(au)
    self.author_sort.setText(authors_to_sort_string(authors))
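The added re.sub strips a trailing ' et al.' before the author string is split, so it never ends up in the author sort. For example (made-up input):

import re

au = 'John Schember & Kovid Goyal et al.'
au = re.sub(r'\s+et al\.$', '', au)
print(au)  # 'John Schember & Kovid Goyal'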
@ -483,9 +486,17 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
def accept(self):
    try:
        if self.formats_changed:
            self.sync_formats()
-       title = qstring_to_unicode(self.title.text())
        title = unicode(self.title.text())
    except IOError, err:
        if err.errno == 13: # Permission denied
            fname = err.filename if err.filename else 'file'
            return error_dialog(self, _('Permission denied'),
                _('Could not open %s. Is it being used by another'
                  ' program?')%fname, show=True)
        raise
    self.db.set_title(self.id, title, notify=False)
    au = unicode(self.authors.text())
    if au:
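The new try/except around the format sync maps errno 13 (EACCES, typically raised when another program holds the file open on Windows) to a friendly dialog instead of a traceback. The same check in isolation, with an assumed helper name (check_writable is not from the commit):

import errno

def check_writable(path):
    # Same idea as the dialog code above: errno 13 is EACCES (permission denied),
    # the usual symptom of a file locked by another program.
    try:
        with open(path, 'r+b'):
            return True
    except IOError as err:
        if err.errno == errno.EACCES:  # errno.EACCES == 13
            return False
        raise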

View File

@ -25,6 +25,8 @@ class Article(object):
            entity_to_unicode, self.title)
except:
    pass
if not isinstance(self.title, unicode):
    self.title = self.title.decode('utf-8', 'replace')
self.url = url
self.author = author
if author and not isinstance(author, unicode):
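The added isinstance check guarantees Article.title is always unicode; decoding with 'replace' never raises, it substitutes U+FFFD for bytes that are not valid UTF-8. A quick illustration with made-up bytes:

raw_title = b'Caf\xc3\xa9 society \xff'  # UTF-8 bytes for "Café", plus one invalid byte
title = raw_title.decode('utf-8', 'replace')
# title is now u'Caf\xe9 society \ufffd' -- the bad byte became U+FFFD instead of raising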

View File

@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe):
def error_in_article_download(self, request, traceback):
    self.jobs_done += 1
-   self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
    self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
    self.log.debug(traceback)
    self.log.debug('\n')
    self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)

View File

@ -15,7 +15,7 @@ recipe_modules = ['recipe_' + r for r in (
    'demorgen_be', 'de_standaard', 'ap', 'barrons', 'chr_mon', 'cnn', 'faznet',
    'jpost', 'jutarnji', 'nasa', 'reuters', 'spiegelde', 'wash_post', 'zeitde',
    'blic', 'novosti', 'danas', 'vreme', 'times_online', 'the_scotsman',
-   'nytimes_sub', 'security_watch', 'cyberpresse', 'st_petersburg_times',
    'nytimes_sub', 'nytimes', 'security_watch', 'cyberpresse', 'st_petersburg_times',
    'clarin', 'financial_times', 'heise', 'le_monde', 'harpers', 'science_aas',
    'science_news', 'the_nation', 'lrb', 'harpers_full', 'liberation',
    'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',

View File

@ -0,0 +1,71 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class CraigsList(BasicNewsRecipe):
title = u'craigslist - Best Of'
oldest_article = 365
max_articles_per_feed = 100
language = _('English')
__author__ = 'kiodane'
feeds = [(u'Best of craigslist',
u'http://www.craigslist.org/about/best/all/index.rss'), (u'Ann Arbor',
u'http://www.craigslist.org/about/best/aaa/index.rss'), (u'Asheville',
u'http://www.craigslist.org/about/best/ash/index.rss'), (u'Austin',
u'http://www.craigslist.org/about/best/aus/index.rss'), (u'Baltimore',
u'http://www.craigslist.org/about/best/bal/index.rss'), (u'Birmingham',
u'http://www.craigslist.org/about/best/bhm/index.rss'), (u'Boston',
u'http://www.craigslist.org/about/best/bos/index.rss'), (u'Vermont',
u'http://www.craigslist.org/about/best/brl/index.rss'), (u'Columbia',
u'http://www.craigslist.org/about/best/cae/index.rss'), (u'Charlotte',
u'http://www.craigslist.org/about/best/cha/index.rss'), (u'Chico',
u'http://www.craigslist.org/about/best/chc/index.rss'), (u'Chicago',
u'http://www.craigslist.org/about/best/chi/index.rss'), (u'Charleston',
u'http://www.craigslist.org/about/best/chs/index.rss'), (u'Cleveland',
u'http://www.craigslist.org/about/best/cle/index.rss'), (u'Calgary',
u'http://www.craigslist.org/about/best/clg/index.rss'),
(u'Colorado Springs', u'http://www.craigslist.org/about/best/cos/index.rss'),
(u'Dallas', u'http://www.craigslist.org/about/best/dal/index.rss'),
(u'Denver', u'http://www.craigslist.org/about/best/den/index.rss'),
(u'Detroit Metro', u'http://www.craigslist.org/about/best/det/index.rss'),
(u'Des Moines', u'http://www.craigslist.org/about/best/dsm/index.rss'),
(u'Eau Claire', u'http://www.craigslist.org/about/best/eau/index.rss'),
(u'Grand Rapids', u'http://www.craigslist.org/about/best/grr/index.rss'),
(u'Hawaii', u'http://www.craigslist.org/about/best/hnl/index.rss'),
(u'Jacksonville', u'http://www.craigslist.org/about/best/jax/index.rss'),
(u'Knoxville', u'http://www.craigslist.org/about/best/knx/index.rss'),
(u'Kansas City', u'http://www.craigslist.org/about/best/ksc/index.rss'),
(u'South Florida', u'http://www.craigslist.org/about/best/mia/index.rss'),
(u'Minneapolis', u'http://www.craigslist.org/about/best/min/index.rss'),
(u'Maine', u'http://www.craigslist.org/about/best/mne/index.rss'),
(u'Montreal', u'http://www.craigslist.org/about/best/mon/index.rss'),
(u'Nashville', u'http://www.craigslist.org/about/best/nsh/index.rss'),
(u'New York', u'http://www.craigslist.org/about/best/nyc/index.rss'),
(u'Orange County', u'http://www.craigslist.org/about/best/orc/index.rss'),
(u'Portland', u'http://www.craigslist.org/about/best/pdx/index.rss'),
(u'Phoenix', u'http://www.craigslist.org/about/best/phx/index.rss'),
(u'Pittsburgh', u'http://www.craigslist.org/about/best/pit/index.rss'),
(u'Rhode Island', u'http://www.craigslist.org/about/best/prv/index.rss'),
(u'Raleigh', u'http://www.craigslist.org/about/best/ral/index.rss'),
(u'Rochester', u'http://www.craigslist.org/about/best/rcs/index.rss'),
(u'San Antonio', u'http://www.craigslist.org/about/best/sat/index.rss'),
(u'Santa Barbara', u'http://www.craigslist.org/about/best/sba/index.rss'),
(u'San Diego', u'http://www.craigslist.org/about/best/sdo/index.rss'),
(u'Seattle-Tacoma', u'http://www.craigslist.org/about/best/sea/index.rss'),
(u'Sf Bay Area', u'http://www.craigslist.org/about/best/sfo/index.rss'),
(u'Salt Lake City',
u'http://www.craigslist.org/about/best/slc/index.rss'), (u'Spokane',
u'http://www.craigslist.org/about/best/spk/index.rss'), (u'St Louis',
u'http://www.craigslist.org/about/best/stl/index.rss'), (u'Sydney',
u'http://www.craigslist.org/about/best/syd/index.rss'), (u'Toronto',
u'http://www.craigslist.org/about/best/tor/index.rss'), (u'Vancouver BC',
u'http://www.craigslist.org/about/best/van/index.rss'), (u'Washington DC',
u'http://www.craigslist.org/about/best/wdc/index.rss')]

View File

@ -1,110 +1,241 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
-mobile.nytimes.com
nytimes.com
'''
import re
-from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe
-from lxml import html
from calibre.ebooks.BeautifulSoup import Tag
-class NYTimesMobile(BasicNewsRecipe):
class NYTimes(BasicNewsRecipe):
-title = 'The New York Times'
title = 'NYTimes Top Stories'
-__author__ = 'Kovid Goyal'
__author__ = 'Greg Riker'
language = _('English')
-description = 'Daily news from the New York Times (mobile version)'
description = 'Top Stories from the New York Times'
-timefmt = ' [%a, %d %b, %Y]'
#max_articles_per_feed = 3
-multithreaded_fetch = True
timefmt = ''
-max_articles_per_feed = 15
needs_subscription = False
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True
-extra_css = '''
-.h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
-.h2 { font-size: large; font-weight: bold }
-.credit { font-size: small }
-.aut { font-weight: bold }
-.bodycontent { font-family: serif }
-'''
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '.headline {text-align:left;}\n\
.byline {font:monospace; margin-bottom:0px;}\n\
.source {align:left;}\n\
.credit {align:right;}\n'
-remove_tags = [
-dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
-dict(name='a', href='/main')
-]
-remove_tags_after = [
-dict(name='a', attrs={'name': 'bottom'})
-]
-def image_url_processor(self, baseurl, url):
-return re.sub(r'(&|&amp;).*', '', url)
-def get_browser(self):
-return BasicNewsRecipe.get_browser(mobile_browser=True)
-def download(self, for_lrf=False):
-if for_lrf:
-self.max_articles_per_feed = 10
-return BasicNewsRecipe.download(self, for_lrf=for_lrf)
-def process_section(self, href):
-raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-articles = []
-while True:
-root = html.fromstring(raw)
-for art in self.find_articles(root):
-append = True
-for x in articles:
-if x['title'] == art['title']:
-append = False
-break
-if append: articles.append(art)
-more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
-if not more:
-break
-href = more[0].get('href')
-raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-return articles
-def find_articles(self, root):
flatPeriodical = True
-for a in root.xpath('//a[@accesskey]'):
-href = a.get('href')
-if href.startswith('http://'):
-url = href
-else:
-url = 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
-yield {
-'title': a.text.strip(),
-'date' : '',
-'url' : url,
-'description': '',
-}
def parse_index(self):
-raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
-root = html.fromstring(raw)
-feeds = [('Latest news', list(self.find_articles(root)))]
-for a in root.xpath('//a[starts-with(@href, "section")]'):
-title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
-print 'Processing section:', title
-articles = self.process_section(a.get('href'))
-feeds.append((title, articles))
-return feeds
-def postprocess_html(self, soup, first_fetch):
-for img in soup.findAll('img', width=True):
-try:
-width = int(img['width'].replace('px', ''))
-if width < 5:
-img.extract()
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
ans = []
if self.flatPeriodical :
feed = key = 'All Top Stories'
articles[key] = []
ans.append(key)
else :
key = None
sections = { 'topstories' : 'Top Stories',
'world' : 'World',
'us' : 'U.S.',
'politics' : 'Politics',
'business' : 'Business',
'technology' : 'Technology',
'sports' : 'Sports',
'arts' : 'Arts',
'newyorkregion': 'New York/Region',
'travel' : 'Travel',
'editorials' : 'Editorials',
'oped' : 'Op-Ed'
}
#excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
excludeSectionKeywords = []
# Fetch the outer table
table = soup.find('table')
previousTable = table
contentTable = None
# Find the deepest table containing the stories
while True :
table = table.find('table')
if table.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
previousTable = table
continue
-except:
-pass
-del img['width']
-del img['height']
-del img.parent['style']
else :
if self.verbose > 2 : self.log( "found table with top stories")
table = previousTable
if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
break
# There are multiple subtables, find the one containing the stories
for block in table.findAll('table') :
if block.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subtable with top stories")
table = block
if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
break
else :
if self.verbose > 2 : self.log( "trying next subtable")
continue
# Again there are multiple subtables, find the one containing the stories
for storyblock in table.findAll('table') :
if storyblock.find(text=re.compile('top stories start')) :
if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
# table = storyblock
if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
break
else :
if self.verbose > 2 : self.log( "trying next subsubtable")
continue
skipThisSection = False
# Within this table are <font face="times new roman, times, san serif"> entries
for tr in storyblock.findAllNext('tr'):
if tr.find('span') is not None :
sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
'times new roman,times, sans serif',
'times new roman, times, sans serif']})
if self.verbose > 2 : self.log( "----------- new tr ----------------")
section = None
bylines = []
descriptions = []
pubdate = None
# Get the Section title
for (x,i) in enumerate(sectionblock.contents) :
skipThisSection = False
# Extract the section title
if ('Comment' in str(i.__class__)) :
if 'start(name=' in i :
section = i[i.find('=')+1:-2]
if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
# Check for excluded section
if len(excludeSectionKeywords):
key = sections[section]
excluded = re.compile('|'.join(excludeSectionKeywords))
if excluded.search(key) or articles.has_key(key):
if self.verbose > 2 : self.log("Skipping section %s" % key)
skipThisSection = True
break
if not self.flatPeriodical :
articles[key] = []
ans.append(key)
# Get the bylines and descriptions
if not skipThisSection :
for (x,i) in enumerate(sectionblock.contents) :
# Extract the bylines and descriptions
if (i.string is not None) and \
(i.string.strip() > "") and \
not ('Comment' in str(i.__class__)) :
contentString = i.strip().encode('utf-8')
if contentString[0:3] == 'By ' :
bylines.append(contentString)
else :
descriptions.append(contentString)
# Fetch the article titles and URLs
articleCount = len(sectionblock.findAll('span'))
for (i,span) in enumerate(sectionblock.findAll('span')) :
a = span.find('a', href=True)
#if not a:
#continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True)
if self.flatPeriodical :
# prepend the section name
title = sections[section] + " : " + title
if not isinstance(title, unicode):
title = title.decode('utf-8', 'replace')
description = descriptions[i]
if len(bylines) == articleCount :
author = bylines[i]
else :
author = None
if self.verbose > 2 : self.log( " title: %s" % title)
if self.verbose > 2 : self.log( " url: %s" % url)
if self.verbose > 2 : self.log( " author: %s" % author)
if self.verbose > 2 : self.log( "description: %s" % description)
if not self.flatPeriodical :
feed = key
if not articles.has_key(feed):
if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
articles[feed] = []
if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, feed))
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author, content=''))
ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
#sys.exit(1)
return ans
def postprocess_html(self, soup, first_fetch):
if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
# Change captions to italic -1
for caption in soup.findAll(True, {'class':'caption'}) :
emTag = Tag(soup, "em")
#emTag['class'] = "caption"
#emTag['font-size-adjust'] = "-1"
emTag.insert(0, caption.contents[0])
hrTag = Tag(soup, 'hr')
emTag.insert(1, hrTag)
caption.replaceWith(emTag)
# Change <nyt_headline> to <h2>
headline = soup.div.div.div.div.div.h1.nyt_headline
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, headline.contents[0])
soup.h1.replaceWith(tag)
return soup
def postprocess_book(self, oeb, opts, log) :
log( " ********** recipe.postprocess_book ********** ")
log( list(oeb.toc) )
log( "oeb: %s" % oeb.toc)
log( "opts: %s" % opts.verbose)
for sections in oeb.toc :
log( "section:")
for articleTOC in sections:
log( " title: %s" % articleTOC.title)
log( " author: %s" % articleTOC.author)
log( "description: %s" % articleTOC.description)
log( " href: %s" % articleTOC.href)
log( " content: %s" % oeb.manifest.hrefs[articleTOC.href])
return
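
For reference, the ans value that parse_index() returns above, and that later surfaces as the TOC which postprocess_book() logs, is a list of (feed title, article list) pairs, each article a dict mirroring the dict(...) built in the loop. Roughly this shape, with all values being placeholders:

# Placeholder illustration of the parse_index() return value; values are made up.
sample_index = [
    ('All Top Stories', [
        {'title': 'Top Stories : An example headline',
         'url': 'http://www.nytimes.com/2009/07/12/example.html?pagewanted=all',
         'date': None,
         'description': 'First sentence of the story, as scraped from the headlines page.',
         'author': 'By A REPORTER',
         'content': ''},
    ]),
]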