diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 215e5a65ce..d044be24b6 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -764,7 +764,25 @@ class Manifest(object):
             # Convert to Unicode and normalize line endings
             data = self.oeb.decode(data)
             data = self.oeb.html_preprocessor(data)
-            orig_data = data
+
+            # Remove DOCTYPE declaration as it messes up parsing
+            # In particular, it causes tostring to insert xmlns
+            # declarations, which messes up the coercing logic
+            idx = data.find('<html')
+            if idx > -1:
+                pre = data[:idx]
+                data = data[idx:]
+                if '<!DOCTYPE' in pre:
+                    user_entities = {}
+                    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
+                        val = match.group(2)
+                        if val.startswith('"') and val.endswith('"'):
+                            val = val[1:-1]
+                        user_entities[match.group(1)] = val
+                    if user_entities:
+                        pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
+                        data = pat.sub(lambda m:user_entities[m.group(1)], data)
+
             # Try with more & more drastic measures to parse
             def first_pass(data):
                 try:
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index 37252f17cd..0c2211e5c7 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -282,8 +282,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
 
         self.initialize_combos()
-
-        self.series_index.setValue(self.db.series_index(row))
+        si = self.db.series_index(row)
+        if si is None:
+            si = 1.0
+        self.series_index.setValue(si)
         QObject.connect(self.series, SIGNAL('currentIndexChanged(int)'), self.enable_series_index)
         QObject.connect(self.series, SIGNAL('editTextChanged(QString)'), self.enable_series_index)
 
@@ -305,6 +307,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
 
     def deduce_author_sort(self):
        au = unicode(self.authors.text())
+       au = re.sub(r'\s+et al\.$', '', au)
        authors = string_to_authors(au)
        self.author_sort.setText(authors_to_sort_string(authors))
 
@@ -483,9 +486,17 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
 
     def accept(self):
-        if self.formats_changed:
-            self.sync_formats()
-        title = qstring_to_unicode(self.title.text())
+        try:
+            if self.formats_changed:
+                self.sync_formats()
+            title = unicode(self.title.text())
+        except IOError, err:
+            if err.errno == 13: # Permission denied
+                fname = err.filename if err.filename else 'file'
+                return error_dialog(self, _('Permission denied'),
+                        _('Could not open %s. Is it being used by another'
+                        ' program?')%fname, show=True)
+            raise
         self.db.set_title(self.id, title, notify=False)
         au = unicode(self.authors.text())
         if au:
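For reference, a minimal standalone sketch of the DOCTYPE/ENTITY handling added in the base.py hunk above. The function name, the re.escape() call, and the sample document are illustrative additions, not part of the patch:

    import re

    def strip_doctype_and_expand_entities(data):
        # Drop everything before <html> (the DOCTYPE plus any internal subset),
        # but first harvest user-defined <!ENTITY ...> declarations from that
        # prefix and expand the matching &name; references in the markup.
        idx = data.find('<html')
        if idx > -1:
            pre, data = data[:idx], data[idx:]
            if '<!DOCTYPE' in pre:
                user_entities = {}
                for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                    val = match.group(2)
                    if val.startswith('"') and val.endswith('"'):
                        val = val[1:-1]
                    user_entities[match.group(1)] = val
                if user_entities:
                    pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
                    data = pat.sub(lambda m: user_entities[m.group(1)], data)
        return data

    # Example:
    #   strip_doctype_and_expand_entities(
    #       '<!DOCTYPE book [ <!ENTITY prod "calibre"> ]><html><body>&prod;</body></html>')
    #   -> '<html><body>calibre</body></html>'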
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index 14ca98f534..5bf1260df4 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -25,6 +25,8 @@ class Article(object):
                                     entity_to_unicode, self.title)
             except:
                 pass
+        if not isinstance(self.title, unicode):
+            self.title = self.title.decode('utf-8', 'replace')
         self.url = url
         self.author = author
         if author and not isinstance(author, unicode):
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 6ca0f8318f..88367ac63e 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -980,7 +980,7 @@ class BasicNewsRecipe(Recipe):
 
     def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
+        self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
         self.log.debug(traceback)
         self.log.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index 51f0000605..78d22fef00 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -15,7 +15,7 @@ recipe_modules = ['recipe_' + r for r in (
          'demorgen_be', 'de_standaard', 'ap', 'barrons', 'chr_mon', 'cnn', 'faznet',
          'jpost', 'jutarnji', 'nasa', 'reuters', 'spiegelde', 'wash_post', 'zeitde',
          'blic', 'novosti', 'danas', 'vreme', 'times_online', 'the_scotsman',
-         'nytimes_sub', 'security_watch', 'cyberpresse', 'st_petersburg_times',
+         'nytimes_sub', 'nytimes', 'security_watch', 'cyberpresse', 'st_petersburg_times',
          'clarin', 'financial_times', 'heise', 'le_monde', 'harpers', 'science_aas',
          'science_news', 'the_nation', 'lrb', 'harpers_full', 'liberation',
          'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',
diff --git a/src/calibre/web/feeds/recipes/recipe_craigslist.py b/src/calibre/web/feeds/recipes/recipe_craigslist.py
new file mode 100644
index 0000000000..bc4fd79131
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_craigslist.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CraigsList(BasicNewsRecipe):
+    title          = u'craigslist - Best Of'
+    oldest_article = 365
+    max_articles_per_feed = 100
+    language = _('English')
+    __author__ = 'kiodane'
+
+    feeds = [(u'Best of craigslist',
+        u'http://www.craigslist.org/about/best/all/index.rss'), (u'Ann Arbor',
+        u'http://www.craigslist.org/about/best/aaa/index.rss'), (u'Asheville',
+        u'http://www.craigslist.org/about/best/ash/index.rss'), (u'Austin',
+        u'http://www.craigslist.org/about/best/aus/index.rss'), (u'Baltimore',
+        u'http://www.craigslist.org/about/best/bal/index.rss'), (u'Birmingham',
+        u'http://www.craigslist.org/about/best/bhm/index.rss'), (u'Boston',
+        u'http://www.craigslist.org/about/best/bos/index.rss'), (u'Vermont',
+        u'http://www.craigslist.org/about/best/brl/index.rss'), (u'Columbia',
+        u'http://www.craigslist.org/about/best/cae/index.rss'), (u'Charlotte',
+        u'http://www.craigslist.org/about/best/cha/index.rss'), (u'Chico',
+        u'http://www.craigslist.org/about/best/chc/index.rss'), (u'Chicago',
+        u'http://www.craigslist.org/about/best/chi/index.rss'), (u'Charleston',
+        u'http://www.craigslist.org/about/best/chs/index.rss'), (u'Cleveland',
+        u'http://www.craigslist.org/about/best/cle/index.rss'), (u'Calgary',
+        u'http://www.craigslist.org/about/best/clg/index.rss'),
+        (u'Colorado Springs', u'http://www.craigslist.org/about/best/cos/index.rss'),
+        (u'Dallas', u'http://www.craigslist.org/about/best/dal/index.rss'),
+        (u'Denver', u'http://www.craigslist.org/about/best/den/index.rss'),
+        (u'Detroit Metro', u'http://www.craigslist.org/about/best/det/index.rss'),
+        (u'Des Moines', u'http://www.craigslist.org/about/best/dsm/index.rss'),
+        (u'Eau Claire', u'http://www.craigslist.org/about/best/eau/index.rss'),
+        (u'Grand Rapids', u'http://www.craigslist.org/about/best/grr/index.rss'),
+        (u'Hawaii', u'http://www.craigslist.org/about/best/hnl/index.rss'),
+        (u'Jacksonville', u'http://www.craigslist.org/about/best/jax/index.rss'),
+        (u'Knoxville', u'http://www.craigslist.org/about/best/knx/index.rss'),
+        (u'Kansas City', u'http://www.craigslist.org/about/best/ksc/index.rss'),
+        (u'South Florida', u'http://www.craigslist.org/about/best/mia/index.rss'),
+        (u'Minneapolis', u'http://www.craigslist.org/about/best/min/index.rss'),
+        (u'Maine', u'http://www.craigslist.org/about/best/mne/index.rss'),
+        (u'Montreal', u'http://www.craigslist.org/about/best/mon/index.rss'),
+        (u'Nashville', u'http://www.craigslist.org/about/best/nsh/index.rss'),
+        (u'New York', u'http://www.craigslist.org/about/best/nyc/index.rss'),
+        (u'Orange County', u'http://www.craigslist.org/about/best/orc/index.rss'),
+        (u'Portland', u'http://www.craigslist.org/about/best/pdx/index.rss'),
+        (u'Phoenix', u'http://www.craigslist.org/about/best/phx/index.rss'),
+        (u'Pittsburgh', u'http://www.craigslist.org/about/best/pit/index.rss'),
+        (u'Rhode Island', u'http://www.craigslist.org/about/best/prv/index.rss'),
+        (u'Raleigh', u'http://www.craigslist.org/about/best/ral/index.rss'),
+        (u'Rochester', u'http://www.craigslist.org/about/best/rcs/index.rss'),
+        (u'San Antonio', u'http://www.craigslist.org/about/best/sat/index.rss'),
+        (u'Santa Barbara', u'http://www.craigslist.org/about/best/sba/index.rss'),
+        (u'San Diego', u'http://www.craigslist.org/about/best/sdo/index.rss'),
+        (u'Seattle-Tacoma', u'http://www.craigslist.org/about/best/sea/index.rss'),
+        (u'Sf Bay Area', u'http://www.craigslist.org/about/best/sfo/index.rss'),
+        (u'Salt Lake City', u'http://www.craigslist.org/about/best/slc/index.rss'),
+        (u'Spokane', u'http://www.craigslist.org/about/best/spk/index.rss'),
+        (u'St Louis', u'http://www.craigslist.org/about/best/stl/index.rss'),
+        (u'Sydney', u'http://www.craigslist.org/about/best/syd/index.rss'),
+        (u'Toronto', u'http://www.craigslist.org/about/best/tor/index.rss'),
+        (u'Vancouver BC', u'http://www.craigslist.org/about/best/van/index.rss'),
+        (u'Washington DC', u'http://www.craigslist.org/about/best/wdc/index.rss')]
+
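As a rough usage sketch: a feed recipe of this shape can be exercised outside the GUI by saving a trimmed copy to a .recipe file and converting it. The class name, file name, and single-feed selection below are hypothetical, and the ebook-convert --test invocation is the usual recipe-development workflow rather than anything introduced by this patch:

    from calibre.web.feeds.news import BasicNewsRecipe

    class CraigsListSmokeTest(BasicNewsRecipe):
        # Hypothetical, trimmed-down variant of the CraigsList recipe above,
        # limited to one feed and a few articles so a test build finishes quickly.
        title                 = u'craigslist - Best Of (smoke test)'
        oldest_article        = 30
        max_articles_per_feed = 5
        no_stylesheets        = True

        feeds = [(u'Best of craigslist',
                  u'http://www.craigslist.org/about/best/all/index.rss')]

    # Typical invocation (depends on the installed calibre version):
    #   ebook-convert craigslist_smoke_test.recipe .epub --test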
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index 9276ad667a..bd150bffcf 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -1,110 +1,241 @@
 #!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
-mobile.nytimes.com
+nytimes.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from lxml import html
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 
-class NYTimesMobile(BasicNewsRecipe):
-
-    title = 'The New York Times'
-    __author__ = 'Kovid Goyal'
+class NYTimes(BasicNewsRecipe):
+
+    title       = 'NYTimes Top Stories'
+    __author__  = 'Greg Riker'
     language = _('English')
-    description = 'Daily news from the New York Times (mobile version)'
-    timefmt = ' [%a, %d %b, %Y]'
-    multithreaded_fetch = True
-    max_articles_per_feed = 15
+    description = 'Top Stories from the New York Times'
+    #max_articles_per_feed = 3
+    timefmt = ''
+    needs_subscription = False
+    remove_tags_before = dict(id='article')
+    remove_tags_after  = dict(id='article')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}),
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                   dict(name=['script', 'noscript', 'style'])]
+    encoding = 'cp1252'
     no_stylesheets = True
-    extra_css = '''
-        .h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
-        .h2 { font-size: large; font-weight: bold }
-        .credit { font-size: small }
-        .aut { font-weight: bold }
-        .bodycontent { font-family: serif }
-    '''
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
-        dict(name='a', href='/main')
-    ]
-    remove_tags_after = [
-        dict(name='a', attrs={'name': 'bottom'})
-    ]
-
-    def image_url_processor(self, baseurl, url):
-        return re.sub(r'(&|&amp;).*', '', url)
-
-    def get_browser(self):
-        return BasicNewsRecipe.get_browser(mobile_browser=True)
-
-    def download(self, for_lrf=False):
-        if for_lrf:
-            self.max_articles_per_feed = 10
-        return BasicNewsRecipe.download(self, for_lrf=for_lrf)
-
-    def process_section(self, href):
-        raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        articles = []
-        while True:
-            root = html.fromstring(raw)
-            for art in self.find_articles(root):
-                append = True
-                for x in articles:
-                    if x['title'] == art['title']:
-                        append = False
-                        break
-                if append: articles.append(art)
-            more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
-            if not more:
-                break
-            href = more[0].get('href')
-            raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        return articles
-
-
-    def find_articles(self, root):
-        for a in root.xpath('//a[@accesskey]'):
-            href = a.get('href')
-            if href.startswith('http://'):
-                url = href
-            else:
-                url = 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
-            yield {
-                'title': a.text.strip(),
-                'date' : '',
-                'url'  : url,
-                'description': '',
-            }
-
-
+    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+    extra_css = '.headline {text-align:left;}\n\
+                .byline {font:monospace; margin-bottom:0px;}\n\
+                .source {align:left;}\n\
+                .credit {align:right;}\n'
+
+
+    flatPeriodical = True
+
     def parse_index(self):
-        raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
-        root = html.fromstring(raw)
-        feeds = [('Latest news', list(self.find_articles(root)))]
-
-        for a in root.xpath('//a[starts-with(@href, "section")]'):
-            title = a.text.replace('»', '').replace(u'\xbb', '').strip()
-            print 'Processing section:', title
-            articles = self.process_section(a.get('href'))
-            feeds.append((title, articles))
-
-        return feeds
-
-    def postprocess_html(self, soup, first_fetch):
-        for img in soup.findAll('img', width=True):
-            try:
-                width = int(img['width'].replace('px', ''))
-                if width < 5:
-                    img.extract()
-                    continue
-            except:
-                pass
-            del img['width']
-            del img['height']
-            del img.parent['style']
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+
+        ans = []
+        if self.flatPeriodical :
+            feed = key = 'All Top Stories'
+            articles[key] = []
+            ans.append(key)
+        else :
+            key = None
+
+        sections = { 'topstories'   : 'Top Stories',
+                     'world'        : 'World',
+                     'us'           : 'U.S.',
+                     'politics'     : 'Politics',
+                     'business'     : 'Business',
+                     'technology'   : 'Technology',
+                     'sports'       : 'Sports',
+                     'arts'         : 'Arts',
+                     'newyorkregion': 'New York/Region',
+                     'travel'       : 'Travel',
+                     'editorials'   : 'Editorials',
+                     'oped'         : 'Op-Ed'
+                   }
+
+        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
+        excludeSectionKeywords = []
+
+        # Fetch the outer table
+        table = soup.find('table')
+        previousTable = table
+        contentTable = None
+
+        # Find the deepest table containing the stories
+        while True :
+            table = table.find('table')
+            if table.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
+                previousTable = table
+                continue
+            else :
+                if self.verbose > 2 : self.log( "found table with top stories")
+                table = previousTable
+                if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
+                break
+
+        # There are multiple subtables, find the one containing the stories
+        for block in table.findAll('table') :
+            if block.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subtable with top stories")
+                table = block
+                if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subtable")
+                continue
+
+        # Again there are multiple subtables, find the one containing the stories
+        for storyblock in table.findAll('table') :
+            if storyblock.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
+                # table = storyblock
+                if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subsubtable")
+                continue
+
+        skipThisSection = False
+
+        # Within this table are entries
+        for tr in storyblock.findAllNext('tr'):
+            if tr.find('span') is not None :
+
+                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
+                                                            'times new roman,times, sans serif',
+                                                            'times new roman, times, sans serif']})
+                if self.verbose > 2 : self.log( "----------- new tr ----------------")
+                section = None
+                bylines = []
+                descriptions = []
+                pubdate = None
+
+                # Get the Section title
+                for (x,i) in enumerate(sectionblock.contents) :
+                    skipThisSection = False
+                    # Extract the section title
+                    if ('Comment' in str(i.__class__)) :
+                        if 'start(name=' in i :
+                            section = i[i.find('=')+1:-2]
+                            if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
+
+                            # Check for excluded section
+                            if len(excludeSectionKeywords):
+                                key = sections[section]
+                                excluded = re.compile('|'.join(excludeSectionKeywords))
+                                if excluded.search(key) or articles.has_key(key):
+                                    if self.verbose > 2 : self.log("Skipping section %s" % key)
+                                    skipThisSection = True
+                                    break
+
+                            if not self.flatPeriodical :
+                                articles[key] = []
+                                ans.append(key)
+
+                # Get the bylines and descriptions
+                if not skipThisSection :
+                    for (x,i) in enumerate(sectionblock.contents) :
+
+                        # Extract the bylines and descriptions
+                        if (i.string is not None) and \
+                           (i.string.strip() > "") and \
+                           not ('Comment' in str(i.__class__)) :
+
+                            contentString = i.strip().encode('utf-8')
+                            if contentString[0:3] == 'By ' :
+                                bylines.append(contentString)
+                            else :
+                                descriptions.append(contentString)
+
+                    # Fetch the article titles and URLs
+                    articleCount = len(sectionblock.findAll('span'))
+                    for (i,span) in enumerate(sectionblock.findAll('span')) :
+                        a = span.find('a', href=True)
+                        #if not a:
+                            #continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        url += '?pagewanted=all'
+                        title = self.tag_to_string(a, use_alt=True)
+                        if self.flatPeriodical :
+                            # prepend the section name
+                            title = sections[section] + " : " + title
+                        if not isinstance(title, unicode):
+                            title = title.decode('utf-8', 'replace')
+                        description = descriptions[i]
+                        if len(bylines) == articleCount :
+                            author = bylines[i]
+                        else :
+                            author = None
+
+
+                        if self.verbose > 2 : self.log( "      title: %s" % title)
+                        if self.verbose > 2 : self.log( "        url: %s" % url)
+                        if self.verbose > 2 : self.log( "     author: %s" % author)
+                        if self.verbose > 2 : self.log( "description: %s" % description)
+
+                        if not self.flatPeriodical :
+                            feed = key
+
+                        if not articles.has_key(feed):
+                            if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
+                            articles[feed] = []
+                        if self.verbose > 2 : self.log( "     adding: %s to articles[%s]\n" % (title, feed))
+                        articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                 description=description, author=author, content=''))
+
+        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        #sys.exit(1)
+
+        return ans
+
+    def postprocess_html(self, soup, first_fetch):
+        if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            emTag = Tag(soup, "em")
+            #emTag['class'] = "caption"
+            #emTag['font-size-adjust'] = "-1"
+            emTag.insert(0, caption.contents[0])
+            hrTag = Tag(soup, 'hr')
+            emTag.insert(1, hrTag)
+            caption.replaceWith(emTag)
+
+
+        # Change <nyt_headline> to <h2 class="headline">
+        headline = soup.div.div.div.div.div.h1.nyt_headline
+        tag = Tag(soup, "h2")
+        tag['class'] = "headline"
+        tag.insert(0, headline.contents[0])
+        soup.h1.replaceWith(tag)
+        return soup
+
+    def postprocess_book(self, oeb, opts, log) :
+        log( " ********** recipe.postprocess_book ********** ")
+        log( list(oeb.toc) )
+        log( "oeb: %s" % oeb.toc)
+        log( "opts: %s" % opts.verbose)
+        for sections in oeb.toc :
+            log( "section:")
+            for articleTOC in sections:
+                log( "      title: %s" % articleTOC.title)
+                log( "     author: %s" % articleTOC.author)
+                log( "description: %s" % articleTOC.description)
+                log( "       href: %s" % articleTOC.href)
+                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
+        return
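A standalone sketch of the headline re-tagging step performed in postprocess_html above. The helper name and the BeautifulSoup import are assumptions for illustration (the recipe itself only imports Tag); the idea is to pull the text out of the custom nyt_headline element and re-emit it as an h2 so the recipe's .headline CSS applies:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

    def retag_nyt_headline(html):
        # Find the custom <nyt_headline> element, copy its text into a new
        # <h2 class="headline">, and swap that in for the page's lead <h1>.
        soup = BeautifulSoup(html)
        headline = soup.find('nyt_headline')
        if headline is not None and headline.contents and soup.h1 is not None:
            tag = Tag(soup, 'h2')
            tag['class'] = 'headline'
            tag.insert(0, headline.contents[0])
            soup.h1.replaceWith(tag)
        return soup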