From cee22c8f3dabcb9f267efbe517f69caaa39ee26e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Mar 2010 04:54:13 +0530 Subject: [PATCH 01/13] Improved recipe for USA Today --- resources/recipes/usatoday.recipe | 440 +++++++++++++++++++++++++++--- 1 file changed, 404 insertions(+), 36 deletions(-) diff --git a/resources/recipes/usatoday.recipe b/resources/recipes/usatoday.recipe index 1a314f652e..368437a709 100644 --- a/resources/recipes/usatoday.recipe +++ b/resources/recipes/usatoday.recipe @@ -7,62 +7,430 @@ usatoday.com ''' from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag import re class USAToday(BasicNewsRecipe): title = 'USA Today' - timefmt = ' [%d %b %Y]' - __author__ = 'Kovid Goyal and Sujata Raman' + __author__ = 'GRiker' + oldest_article = 1 + timefmt = '' max_articles_per_feed = 20 language = 'en' - - no_stylesheets = True - extra_css = ''' - .inside-head{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } - .inside-head2{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } - .inside-head3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold } - h3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold; } - h4{font-family:Arial,Helvetica,sans-serif; font-size:x-small; font-weight:bold; } - .side-by-side{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} - #byLineTag{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} - .inside-copy{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left;} - .caption{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} - li{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} - .vatext{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;} - .vaTextBold{font-family:Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold; color:#666666;} - ''' - remove_tags = [ - {'class':['tagListLabel','piped-taglist-string','socialcontainer','social-wrapper',]}, - {'id':['topSocialButtons']}, - ] - + extra_css = '.headline {text-align: left;}\n \ + .byline {font-family: monospace; \ + text-align: left; \ + margin-bottom: 1em;}\n \ + .image {text-align: center;}\n \ + .caption {text-align: center; \ + font-size: smaller; \ + font-style: italic}\n \ + .credit {text-align: right; \ + margin-bottom: 0em; \ + font-size: smaller;}\n \ + .articleBody {text-align: left;}\n ' conversion_options = { 'linearize_tables' : True } - - preprocess_regexps = [ - (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''), - (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''), - ] - - + #simultaneous_downloads = 1 feeds = [ ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), - ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), + ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'), + ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'), + ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'), ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'), ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'), ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'), + ('Sport Headlines', 
'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), + ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'), ] + keep_only_tags = [dict(attrs={'class':[ + 'byLine', + 'inside-copy', + 'inside-head', + 'inside-head2', + 'item', + 'item-block', + 'photo-container', + ]}), + dict(id=[ + 'applyMainStoryPhoto', + 'permalink', + ])] - ## Getting the print version + remove_tags = [dict(attrs={'class':[ + 'comments', + 'jump', + 'pagetools', + 'post-attributes', + 'tags', + ]}), + dict(id=[])] - def print_version(self, url): - return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url + #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')] + + def dump_hex(self, src, length=16): + ''' Diagnostic ''' + FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + N=0; result='' + while src: + s,src = src[:length],src[length:] + hexa = ' '.join(["%02X"%ord(x) for x in s]) + s = s.translate(FILTER) + result += "%04X %-*s %s\n" % (N, length*3, hexa, s) + N+=length + print result + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + + return fixed + + def get_masthead_url(self): + masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def parse_feeds(self, *args, **kwargs): + parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs) + # Count articles for progress dialog + content_feeds = [] + article_count = 0 + for feed in parsed_feeds: + article_count += len(feed) + self.log( "Queued %d articles" % article_count) + return parsed_feeds + + def preprocess_html(self, soup): + soup = self.strip_anchors(soup) + return soup def postprocess_html(self, soup, first_fetch): - for t in soup.findAll(['table', 'tr', 'td']): - t.name = 'div' + + # Remove navLinks
+        navLinks = soup.find(True,{'style':'padding-bottom:3px'})
+        if navLinks:
+            navLinks.extract()
+
+        # Remove the gibberish <div style="margin-bottom:10px">
+        gibberish = soup.find(True,{'style':'margin-bottom:10px'})
+        if gibberish:
+            gibberish.extract()
+
+        # Change <div class="inside-head"> to <h2 class="headline">
+        headline = soup.find(True, {'class':['inside-head','inside-head2']})
+        if not headline:
+            headline = soup.find('h3')
+        if headline:
+            tag = Tag(soup, "h2")
+            tag['class'] = "headline"
+            tag.insert(0, headline.contents[0])
+            headline.replaceWith(tag)
+        else:
+            print "unable to find headline:\n%s\n" % soup
+
+        # Change byLine to byline, change commas to middot
+        # Kindle renders commas in byline as '&amp;'
+        byline = soup.find(True, {'class':'byLine'})
+        if byline:
+            byline['class'] = 'byline'
+            # Replace comma with middot
+            byline.contents[0].replaceWith(re.sub(","," ·", byline.renderContents()))
+
+        jumpout_punc_list = [':','?']
+        # Remove the inline jumpouts in <div class="inside-copy">
+        paras = soup.findAll(True, {'class':'inside-copy'})
+        for para in paras:
+            if re.match("[\w\W]+ <b>",para.renderContents()):
+                p = para.find('b')
+                for punc in jumpout_punc_list:
+                    punc_offset = p.contents[0].find(punc)
+                    if punc_offset == -1:
+                        continue
+                    if punc_offset > 1:
+                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
+                            #print "extracting \n%s\n" % para.prettify()
+                            para.extract()
+
+        # Reset class for remaining
+        paras = soup.findAll(True, {'class':'inside-copy'})
+        for para in paras:
+            para['class'] = 'articleBody'
+
+        # Remove inline jumpouts in <p> blocks

+ paras = soup.findAll(['p']) + for p in paras: + if hasattr(p,'contents') and len(p.contents): + for punc in jumpout_punc_list: + punc_offset = p.contents[0].find(punc) + if punc_offset == -1: + continue + if punc_offset > 2 and hasattr(p,'a') and len(p.contents): + #print "evaluating %s\n" % p.contents[0][:punc_offset+1] + if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper(): + #print "extracting \n%s\n" % p.prettify() + p.extract() + + # Capture the first img, insert after headline + imgs = soup.findAll('img') + print "postprocess_html(): %d images" % len(imgs) + if imgs: + divTag = Tag(soup, 'div') + divTag['class'] = 'image' + body = soup.find('body') + img = imgs[0] + #print "img: \n%s\n" % img.prettify() + + # Table for photo and credit + tableTag = Tag(soup,'table') + + # Photo + trimgTag = Tag(soup, 'tr') + tdimgTag = Tag(soup, 'td') + tdimgTag.insert(0,img) + trimgTag.insert(0,tdimgTag) + tableTag.insert(0,trimgTag) + + # Credit + trcreditTag = Tag(soup, 'tr') + + tdcreditTag = Tag(soup, 'td') + tdcreditTag['class'] = 'credit' + credit = soup.find('td',{'class':'photoCredit'}) + if credit: + tdcreditTag.insert(0,NavigableString(credit.renderContents())) + else: + credit = img['credit'] + if credit: + tdcreditTag.insert(0,NavigableString(credit)) + else: + tdcreditTag.insert(0,NavigableString('')) + + trcreditTag.insert(0,tdcreditTag) + tableTag.insert(1,trcreditTag) + dtc = 0 + divTag.insert(dtc,tableTag) + dtc += 1 + + if False: + # Add the caption in the table + tableCaptionTag = Tag(soup,'caption') + tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents()) + tableTag.insert(1,tableCaptionTag) + divTag.insert(dtc,tableTag) + dtc += 1 + body.insert(1,divTag) + else: + # Add the caption below the table + #print "Looking for caption in this soup:\n%s" % img.prettify() + captionTag = Tag(soup,'p') + captionTag['class'] = 'caption' + if hasattr(img,'alt') and img['alt']: + captionTag.insert(0,NavigableString('
%s' % img['alt']))
+                    divTag.insert(dtc, captionTag)
+                    dtc += 1
+                else:
+                    try:
+                        captionTag.insert(0,NavigableString('%s
' % img['cutline'])) + divTag.insert(dtc, captionTag) + dtc += 1 + except: + pass + + hrTag = Tag(soup, 'hr') + divTag.insert(dtc, hrTag) + dtc += 1 + + # Delete
inline image <div> - restructure
+        insertLoc = 0
+        tag = body.find(True)
+        while True:
+            insertLoc += 1
+            try:
+                if hasattr(tag,'class') and tag['class'] == 'headline':
+                    headline_found = True
+                    tag.insert(insertLoc,divTag)
+                    break
+            except:
+                pass
+            tag = tag.next
+            if not tag:
+                break
+
+        # Yank out headline, img and caption
+        headline = body.find('h2','headline')
+        img = body.find('div','image')
+        caption = body.find('p','caption')
+
+        # body(0) is calibre_navbar
+        # body(1) is <div class="image">
+
+        btc = 1
+        headline.extract()
+        body.insert(1, headline)
+        btc += 1
+        if img:
+            img.extract()
+            body.insert(btc, img)
+            btc += 1
+        if caption:
+            caption.extract()
+            body.insert(btc, caption)
+            btc += 1
+
+        if len(imgs) > 1:
+            if True:
+                [img.extract() for img in imgs[1:]]
+            else:
+                # Format the remaining images
+                # This doesn't work yet
+                for img in imgs[1:]:
+                    print "img:\n%s\n" % img.prettify()
+                    divTag = Tag(soup, 'div')
+                    divTag['class'] = 'image'
+
+                    # Table for photo and credit
+                    tableTag = Tag(soup,'table')
+
+                    # Photo
+                    trimgTag = Tag(soup, 'tr')
+                    tdimgTag = Tag(soup, 'td')
+                    tdimgTag.insert(0,img)
+                    trimgTag.insert(0,tdimgTag)
+                    tableTag.insert(0,trimgTag)
+
+                    # Credit
+                    trcreditTag = Tag(soup, 'tr')
+
+                    tdcreditTag = Tag(soup, 'td')
+                    tdcreditTag['class'] = 'credit'
+                    try:
+                        tdcreditTag.insert(0,NavigableString(img['credit']))
+                    except:
+                        tdcreditTag.insert(0,NavigableString(''))
+                    trcreditTag.insert(0,tdcreditTag)
+                    tableTag.insert(1,trcreditTag)
+                    divTag.insert(0,tableTag)
+                    soup.img.replaceWith(divTag)
+
+        return soup
+
+    def postprocess_book(self, oeb, opts, log) :
+
+        def extract_description(href) :
+            # Use the first chunk of the article body for the TOC description
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            p = soup.find(True, attrs={'class':'articleBody'})
+            if p:
+                return self.massageNCXText(self.tag_to_string(p,use_alt=False))
+            return None
+
+        def extract_byline(href) :
+            # Grab the text of the <p class='byline'> tag
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            p = soup.find(True, attrs={'class':'byline'})
+            if p:
+                return self.massageNCXText(self.tag_to_string(p,use_alt=False))
+            else:
+                print "Didn't find <p class='byline'>
in this soup:\n%s" % soup.prettify() + return None + + # Method entry point here + # Single section toc looks different than multi-section tocs + if oeb.toc.depth() == 2 : + for article in oeb.toc : + if article.author is None : + article.author = extract_byline(article.href) + if article.description is None : + article.description = extract_description(article.href) + elif oeb.toc.depth() == 3 : + for section in oeb.toc : + for article in section : + article.author = extract_byline(article.href) + ''' + if article.author is None : + article.author = self.massageNCXText(extract_byline(article.href)) + else: + article.author = self.massageNCXText(article.author) + ''' + if article.description is None : + article.description = extract_description(article.href) + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup From 37002f9b912cabb332c7f4de0532562c1248a831 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Mar 2010 05:00:08 +0530 Subject: [PATCH 02/13] Tulsa World by Darko Miletic --- resources/images/news/tulsaworld.png | Bin 0 -> 995 bytes resources/recipes/tulsaworld.recipe | 47 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 resources/images/news/tulsaworld.png create mode 100644 resources/recipes/tulsaworld.recipe diff --git a/resources/images/news/tulsaworld.png b/resources/images/news/tulsaworld.png new file mode 100644 index 0000000000000000000000000000000000000000..dc68d0726603a39338d163dbf0d93a3d42a0d843 GIT binary patch literal 995 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87?^oHT^vI!PA|QDGGCfi_g~TQMhVq)KUoZBm|9wzD?Sa)g^^QY#kadv>+(v&xLpO`3>h9&&I`}pDYHuS&oNm!nUqY);}fTfJT02G zbH+Hc?4x4Tc*e7gC(-~58k`RWLbNkzt2w&c$GWT^B|nD%M4=OYbd8zI8+I#6*R|liIUiX$0}# zzVzEZqkPwmX-C&i71{mNKt8^+@bOzqp5hLd8>x1-XV|CDdvqnECGWbEKy8W6#Jx33 z>ksWLkvX4raqIFQl}nFCZ9Vk*x$Cp@8ZFGWN{o~HVx6|fNLBSu{>ZrJpg_I&?AnCy z3yyBAm~zu|X42Z;Rkw5AaGyC}HOEiNV%mxMuj?n)DLFZRov5Ex5^d@dzkz?vyAX-) zpPnwvKU^DMPIMGtaeM5n@`R)9t+MUIi+caPj6&4Y%QUWA&9S$>&MtIO;cJ?7q}PHL zmu2owKmOKPxy`koR)5?`_R~(8nK^BY%a1P+a=M%nxBEwqe8u~x#x<42Q?C>>EXxa6 z6*WgH`~uI7`%y`SwVbD48A?o;(6M2$r;l%6@zb^8zYcsZ_6&RPUGp(`+64ZzV~aH! z^(>V9m&G{CO*FEMu6sV#RYEt(jcc#v`T0k_|-JJUysDZ)L L)z4*}Q$iB}hWNCN literal 0 HcmV?d00001 diff --git a/resources/recipes/tulsaworld.recipe b/resources/recipes/tulsaworld.recipe new file mode 100644 index 0000000000..bdb6969853 --- /dev/null +++ b/resources/recipes/tulsaworld.recipe @@ -0,0 +1,47 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +tulsaworld.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class TulsaWorld(BasicNewsRecipe): + title = 'Tulsa World' + __author__ = 'Darko Miletic' + description = 'Find breaking news, local news, Oklahoma weather, sports, business, entertainment, lifestyle, opinion, government, movies, books, jobs, education, blogs, video & multimedia.' + publisher = 'World Publishing Co.' 
+ category = 'Tulsa World, tulsa world, daily newspaper, breaking news, stories, articles, news, local, weather, coverage, editorial, government, education, community, sports, business, entertainment, lifestyle, opinion, multimedia, media, blogs, consumer, OU, OSU, TU, ORU, football, basketball, school, schools, sudoku, movie reviews, stocks, classified ads, classifieds, books, job, jobs, careers, real estate, home, homes, Oklahoma, northeastern, reviews, auto, autos, archives, forecasts, Sooners, Cowboys, Hurricane, Golden Eagles, NFL, NBA, MLB, pro football, scores, college basketball, college football, college baseball, sports columns, fashion and style, associated press, regional news coverage, health, obituaries, politics, political news, Jenks, Union, Owasso, Tulsa, Booker T. Washington, Trojans, Rams, Hornets, video, photography, photos, images, games, search, the picker, predictions, satellite, family, food, teens, polls, births, celebrations, death notices, divorces, marriages, obituaries, audio, podcasts.' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + country = 'US' + remove_empty_feeds = True + masthead_url = 'http://www.tulsaworld.com/images/TW_logo-blue-footer.jpg' + extra_css = ' body{font-family: Arial,Verdana,sans-serif } img{margin-bottom: 0.4em} .articleHeadline{font-size: xx-large; font-weight: bold} .articleKicker{font-size: x-large; font-weight: bold} .articleByline,.articleDate{font-size: small} .leadp{font-size: 1.1em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + keep_only_tags = [dict(name='div',attrs={'id':['ctl00_body1_ArticleControl_divArticleText','ctl00_BodyContent_ArticleControl_divArticleText']})] + + feeds = [ + (u'News' , u'http://www.tulsaworld.com/site/rss.aspx?group=1') + ,(u'Business', u'http://www.tulsaworld.com/site/rss.aspx?group=5') + ,(u'Opinion' , u'http://www.tulsaworld.com/site/rss.aspx?group=7') + ] + + def get_article_url(self, article): + return article.get('link', None).rpartition('&rss')[0] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return self.adeify_images(soup) \ No newline at end of file From a72fd264fe048912f473dc1c8eaf81dfdb394536 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 20 Mar 2010 15:50:08 -0400 Subject: [PATCH 03/13] Fix FB2 TOC links. 
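Note: the fb2ml.py hunk below changes how internal link targets are normalized. A minimal
standalone sketch of the scheme, assuming the behaviour shown in the hunk (normalize_links
is a hypothetical helper, not code from this patch): internal hrefs lose their leading '#',
and each unique target is mapped to a stable calibre_link-N anchor, so '#toc' and 'toc'
resolve to the same FB2 id.

    # Hypothetical condensation of the fb2ml.py logic, for illustration only.
    def normalize_links(hrefs):
        link_ids = {}
        out = []
        for href in hrefs:
            if '://' in href:
                out.append(href)      # external links pass through untouched
                continue
            if href.startswith('#'):
                href = href[1:]       # '#toc' and 'toc' are the same target
            if href not in link_ids:
                link_ids[href] = 'calibre_link-%s' % len(link_ids)
            out.append('#' + link_ids[href])
        return out

    print normalize_links(['#toc', 'toc', 'http://example.com', 'ch2'])
    # ['#calibre_link-0', '#calibre_link-0', 'http://example.com', '#calibre_link-1']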
--- .bzrignore | 4 ++++ src/calibre/ebooks/fb2/fb2ml.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.bzrignore b/.bzrignore index a3257f7de3..6adb047922 100644 --- a/.bzrignore +++ b/.bzrignore @@ -22,3 +22,7 @@ src/cssutils/stylesheets/.svn/ src/odf/.svn tags nbproject/ +*.mdproj +*.pidb +*.sln +*.userprefs diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index c8428cf136..d61f4369e6 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -230,8 +230,8 @@ class FB2MLizer(object): if '://' in href: fb2_text.append('' % href) else: - if '#' not in href: - href += '#' + if href.startswith('#'): + href = href[1:] if href not in self.link_hrefs.keys(): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) href = self.link_hrefs[href] From 8af795d9dd34ce97bfa9ca3f07afd3553f64ff9d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Mar 2010 06:58:02 +0530 Subject: [PATCH 04/13] Fix NYTimes --- resources/recipes/nytimes_sub.recipe | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index c126902899..78f6016c94 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe): # List of sections to exclude # To add a section, copy the section name from the allSectionKeywords list above # For example, to exclude 'Dining' and 'Weddings': - # excludeSectionKeywords = ['Dining','Weddings'] + #excludeSectionKeywords = ['Dining','Weddings'] excludeSectionKeywords = [] # List of sections to include (test and debug only) @@ -56,9 +56,12 @@ class NYTimes(BasicNewsRecipe): remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':[ + 'articleFooter', + 'articleInline runaroundLeft', 'articleTools', 'columnGroup doubleRule', 'columnGroup last', + 'columnGroup last', 'doubleRule', 'dottedLine', 'entry-meta', @@ -70,6 +73,7 @@ class NYTimes(BasicNewsRecipe): 'relatedSearchesModule', 'side_tool', 'singleAd', + 'subNavigation tabContent active clearfix', ]}), dict(id=[ 'adxLeaderboard', @@ -222,11 +226,11 @@ class NYTimes(BasicNewsRecipe): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key): - self.log("Skipping section %s" % key) - continue - + if len(self.excludeSectionKeywords): + excluded = re.compile('|'.join(self.excludeSectionKeywords)) + if excluded.search(key): + self.log("Skipping section %s" % key) + continue articles[key] = [] ans.append(key) From 70d9a6d3d60b99276467832809f988fe862fa6fb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Mar 2010 07:20:57 +0530 Subject: [PATCH 05/13] Don't choke if the user provides an invalid remove header/footer regexp --- src/calibre/ebooks/conversion/preprocess.py | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 29ce0e4296..ada4f1a3af 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -230,14 +230,25 @@ class HTMLPreProcessor(object): end_rules = [] if getattr(self.extra_opts, 'remove_header', None): - end_rules.append( - (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') - ) + try: + 
end_rules.append( + (re.compile(self.extra_opts.header_regex), lambda match : '') + ) + except: + import traceback + print 'Failed to parse remove_header regexp' + traceback.print_exc() + if getattr(self.extra_opts, 'remove_footer', None): - end_rules.append( - (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') - ) - + try: + end_rules.append( + (re.compile(self.extra_opts.footer_regex), lambda match : '') + ) + except: + import traceback + print 'Failed to parse remove_footer regexp' + traceback.print_exc() + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) if length: From 91f3f2d8b8addbaef9975b0eb7b53d8abfa0b44d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Mar 2010 06:43:16 +0530 Subject: [PATCH 06/13] More intelligent error message when user selects wrong card for send to device operation on SONYs and improved nyimes --- resources/recipes/nytimes_sub.recipe | 5 +++-- src/calibre/devices/prs505/driver.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 78f6016c94..93df08220d 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -57,9 +57,9 @@ class NYTimes(BasicNewsRecipe): remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':[ 'articleFooter', - 'articleInline runaroundLeft', 'articleTools', 'columnGroup doubleRule', + 'columnGroup singleRule', 'columnGroup last', 'columnGroup last', 'doubleRule', @@ -68,6 +68,7 @@ class NYTimes(BasicNewsRecipe): 'icon enlargeThis', 'leftNavTabs', 'module box nav', + 'nextArticleLink', 'nextArticleLink clearfix', 'post-tools', 'relatedSearchesModule', @@ -226,7 +227,7 @@ class NYTimes(BasicNewsRecipe): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) - if len(self.excludeSectionKeywords): + if self.excludeSectionKeywords: excluded = re.compile('|'.join(self.excludeSectionKeywords)) if excluded.search(key): self.log("Skipping section %s" % key) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 5d759be47c..448965a913 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -150,7 +150,8 @@ class PRS505(CLI, Device): for location in locations: info = metadata.next() path = location[0] - blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0 + oncard = location[3] + blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0 if self._main_prefix and path.startswith(self._main_prefix): name = path.replace(self._main_prefix, '') @@ -166,7 +167,11 @@ class PRS505(CLI, Device): opts = self.settings() collections = opts.extra_customization.split(',') if opts.extra_customization else [] - booklists[blist].add_book(info, name, collections, *location[1:-1]) + booklist = booklists[blist] + if not hasattr(booklist, 'add_book'): + raise ValueError(('Incorrect upload location %s. Did you choose the' + ' correct card A or B, to send books to?')%oncard) + booklist.add_book(info, name, collections, *location[1:-1]) fix_ids(*booklists) def delete_books(self, paths, end_session=True): From 3a982d51b0d062a965aeac679ffd334f00490a2a Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 22 Mar 2010 19:25:24 -0400 Subject: [PATCH 07/13] Fix bug #5149: PML, external links added incorrectly. 
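Note: a condensed sketch of the intended behaviour, assuming the pmlml.py hunk below
(pml_link is a hypothetical helper, not code from this patch). The '#' is now folded into
the href once, before the \q markup is written, so internal anchors come out as
\q="#calibre_link-N" without doubling or dropping the '#'.

    # Hypothetical condensation of the pmlml.py change, for illustration only.
    def pml_link(href, link_ids):
        if '://' in href:
            return '\\q="%s"' % href              # external URL, used verbatim
        href = href.lstrip('#')
        if href not in link_ids:
            link_ids[href] = 'calibre_link-%s' % len(link_ids)
        return '\\q="#%s"' % link_ids[href]       # internal anchor

    ids = {}
    print pml_link('#chapter-2', ids)         # \q="#calibre_link-0"
    print pml_link('http://example.com', ids) # \q="http://example.com"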
--- src/calibre/ebooks/pml/pmlml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 3df24fcc86..e076148360 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -260,8 +260,8 @@ class PMLMLizer(object): href += '#' if href not in self.link_hrefs.keys(): self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) - href = self.link_hrefs[href] - text.append('\\q="#%s"' % href) + href = '#%s' % self.link_hrefs[href] + text.append('\\q="%s"' % href) tags.append('q') # Anchor ids From fdaed4a1690298d20e49034656fc3e65ca5412b4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Mar 2010 09:46:54 +0530 Subject: [PATCH 08/13] IEEE Spectrum by Franco Venturi --- resources/recipes/ieeespectrum.recipe | 67 +++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 resources/recipes/ieeespectrum.recipe diff --git a/resources/recipes/ieeespectrum.recipe b/resources/recipes/ieeespectrum.recipe new file mode 100644 index 0000000000..79a107cd9d --- /dev/null +++ b/resources/recipes/ieeespectrum.recipe @@ -0,0 +1,67 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Franco Venturi ' +''' +spectrum.ieee.org +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from string import capwords +from urlparse import urljoin + +class IEEESpectrum(BasicNewsRecipe): + title = 'IEEE Spectrum' + __author__ = 'Franco Venturi' + description = 'Electronics News from IEEE' + publisher = 'IEEE' + category = 'news, electronics, IT, computer science' + oldest_article = 32 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + language = 'en' + index = 'http://spectrum.ieee.org/magazine/' + masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png' + + remove_javascript = True + remove_tags = [dict(name={'script':True, 'object':True})] + remove_attributes = ['height','width','alt'] + keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})] + + +# def get_cover_url(self): +# cover_url = None +# soup = self.index_to_soup(self.index) +# cover_item = soup.find('img',attrs={'image':'cover.gif'}) +# if cover_item: +# cover_url = urljoin(self.index, cover_item['src']) +# return cover_url + + def parse_index(self): + soup = self.index_to_soup(self.index) + content = soup.find(id='gnrlContent') + title = content.find(attrs={'class':'style4'}).string.strip() + date = ' '.join(title.split()[0:2]) + self.timefmt = ' [' + date + ']' + contents = [] + for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}): + if tag['class'] == 'style2': + contents.append((capwords(tag.renderContents().strip()), [])) + elif tag['class'] == 'lstngTitle': + url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0' + contents[-1][1].append({'title': tag.renderContents().strip(), + 'url': url, + 'date': date, + 'description': '', + 'content': '' + }) + elif tag['class'] == 'lstngBody': + contents[-1][1][-1]['description'] = tag.renderContents().strip() + + return contents + + def preprocess_html(self, soup): + for a in soup.findAll('a'): + if not a['href'].lower().startswith('http'): + a['href'] = urljoin(self.index, a['href']) + return soup From 69ec0e1ee56699429b4b5ee4b9cb2bddccaec070 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 23 Mar 2010 11:36:00 +0530 Subject: [PATCH 09/13] ... 
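Note: besides the IEEE Spectrum cover lookup, this commit hardens the cover-extension
logic in news.py by taking the extension from the last path segment only, so dots in
directory names can no longer leak into the extension. A sketch of the resulting
behaviour (cover_ext is a hypothetical wrapper around the same expression):

    # Hypothetical wrapper around the new extension logic, for illustration only.
    def cover_ext(cu):
        ext = cu.split('/')[-1].rpartition('.')[-1]   # last path segment only
        if '?' in ext:
            ext = ''
        return ext.lower() if ext and '/' not in ext else 'jpg'

    print cover_ext('http://host/covers/march.JPG')  # jpg
    print cover_ext('http://host/cover.jpg?w=200')   # jpg (query string -> fallback)
    print cover_ext('http://host/a.b/cover')         # cover (old code yielded 'b/cover')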
--- resources/recipes/ieeespectrum.recipe | 12 ++++-------- src/calibre/web/feeds/news.py | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/resources/recipes/ieeespectrum.recipe b/resources/recipes/ieeespectrum.recipe index 79a107cd9d..e2490b2a6c 100644 --- a/resources/recipes/ieeespectrum.recipe +++ b/resources/recipes/ieeespectrum.recipe @@ -29,16 +29,12 @@ class IEEESpectrum(BasicNewsRecipe): keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})] -# def get_cover_url(self): -# cover_url = None -# soup = self.index_to_soup(self.index) -# cover_item = soup.find('img',attrs={'image':'cover.gif'}) -# if cover_item: -# cover_url = urljoin(self.index, cover_item['src']) -# return cover_url - def parse_index(self): soup = self.index_to_soup(self.index) + img = soup.find('img', image='cover.gif', src=True) + if img is not None: + self.cover_url = 'http://spectrum.ieee.org'+img['src'] + content = soup.find(id='gnrlContent') title = content.find(attrs={'class':'style4'}).string.strip() date = ' '.join(title.split()[0:2]) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index d07c135abd..496a1f4d5b 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe): self.log.error(_('Could not download cover: %s')%str(err)) self.log.debug(traceback.format_exc()) if cu is not None: - ext = cu.rpartition('.')[-1] + ext = cu.split('/')[-1].rpartition('.')[-1] if '?' in ext: ext = '' - ext = ext.lower() if ext else 'jpg' + ext = ext.lower() if ext and '/' not in ext else 'jpg' cpath = os.path.join(self.output_dir, 'cover.'+ext) if os.access(cu, os.R_OK): with open(cpath, 'wb') as cfile: From c1a6be45b95c1f46de1ed7872b6d49fa8e688ed5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Mar 2010 08:47:31 +0530 Subject: [PATCH 10/13] ... --- resources/recipes/times_online.recipe | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/resources/recipes/times_online.recipe b/resources/recipes/times_online.recipe index 98c26e6a66..98e96552ce 100644 --- a/resources/recipes/times_online.recipe +++ b/resources/recipes/times_online.recipe @@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe): use_embedded_content = False simultaneous_downloads = 1 encoding = 'ISO-8859-1' - lang = 'en-UK' remove_javascript = True - language = 'en' + language = 'en_GB' recursions = 9 match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]'] From 08c7ed70fae8c76e628e7b6290309051a83b4a88 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 25 Mar 2010 07:54:12 +0530 Subject: [PATCH 11/13] New York Post by Darko Miletic --- resources/images/news/nypost.png | Bin 0 -> 400 bytes resources/recipes/nypost.recipe | 36 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 resources/images/news/nypost.png create mode 100644 resources/recipes/nypost.recipe diff --git a/resources/images/news/nypost.png b/resources/images/news/nypost.png new file mode 100644 index 0000000000000000000000000000000000000000..f9a93cfbb3dfcb73e2d8c877b32b870ee497f203 GIT binary patch literal 400 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*VRoSmL7jv*GO*9HgjwJPweK32VPc4XeQxjFBh z-5qM5JySB zc@`L|2lb|$&R(lKouyp9dP4isDaY>ZR=Al}#eQSHp-0;i4~CAv79QJXwJez+QXKVi z_7~nXwsQBiqG{W9PB76>tzKX8Jk2w-oc-kF$t5@5_s=)4i>>7s@?Q1rd}YbRpI7Z? 
zCMExu)<1J-bLKI2tyh<~x&EjT26{rZ#5JNMC9x#cD!C{XNHG{07#ipr80#9Dh8UPz znHpJ{SZEuVS{WGpe-LMnq9HdwB{QuOp}{1?$iT|T#L5(?!3?Ou?Nfa)Py>UftDnm{ Hr-UW|jR=gl literal 0 HcmV?d00001 diff --git a/resources/recipes/nypost.recipe b/resources/recipes/nypost.recipe new file mode 100644 index 0000000000..694f5b04d2 --- /dev/null +++ b/resources/recipes/nypost.recipe @@ -0,0 +1,36 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +nypost.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class NYPost(BasicNewsRecipe): + title = 'New York Post' + __author__ = 'Darko Miletic' + description = 'Daily newspaper' + publisher = 'NYP Holdings, Inc.' + category = 'news, politics, USA' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + masthead_url = 'http://www.nypost.com/rw/SysConfig/WebPortal/nypost/images/nyp_logo_230x32.gif' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags=[dict(name='div', attrs={'id':'story'})] + + feeds = [(u'Articles', u'http://www.nypost.com/rss/all_section.xml')] + + def print_version(self, url): + return url.replace('nypost.com/p/','nypost.com/f/print/') From b6c27b73597ae7e47b6b6e591476de7236498e40 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 25 Mar 2010 08:21:13 +0530 Subject: [PATCH 12/13] SONY driver: Tags within [] do not translate into a category --- src/calibre/devices/prs505/books.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index 623f6c19ad..9d943bd8e6 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -226,11 +226,19 @@ class BookList(_BookList): for item in collections: item = item.strip() mitem = getattr(mi, item, None) + titems = [] if mitem: if isinstance(mitem, list): - tags.extend(mitem) + titems = mitem else: - tags.append(mitem) + titems = [mitem] + if item == 'tags' and titems: + litems = [] + for i in titems: + if not i.strip().startswith('[') and not i.strip().endswith(']'): + litems.append(i) + titems = litems + tags.extend(titems) if tags: tags = list(set(tags)) if hasattr(mi, 'tag_order'): From 656ce3eac8e2287ca6086d08cacceae7116a652b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 26 Mar 2010 09:00:11 +0530 Subject: [PATCH 13/13] Update New York Times Top Stories --- resources/recipes/nytimes.recipe | 386 +++++++++++++++++--------- src/calibre/devices/android/driver.py | 2 +- 2 files changed, 260 insertions(+), 128 deletions(-) diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index 32e5a4825e..3b9d2858e6 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com ''' -import re, time +import re +import time from calibre import entity_to_unicode from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment @@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe): title = 'New York Times Top Stories' __author__ = 'GRiker' - language = 'en' + language = _('English') description = 'Top Stories from the New York Times' # List of sections typically included in Top Stories. 
Use a keyword from the @@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe): 'world' : 'World' } - # By default, no sections are skipped. - excludeSectionKeywords = [] - # Add section keywords from the right column above to skip that section # For example, to skip sections containing the word 'Sports' or 'Dining', use: # excludeSectionKeywords = ['Sports', 'Dining'] @@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe): # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World'] # Fetch only Top Stories # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World'] + # By default, no sections are skipped. + excludeSectionKeywords = [] + + # one_picture_per_article specifies that calibre should only use the first image + # from an article (if one exists). If one_picture_per_article = True, the image + # will be moved to a location between the headline and the byline. + # If one_picture_per_article = False, all images from the article will be included + # and shown in their original location. + one_picture_per_article = True # The maximum number of articles that will be downloaded max_articles_per_feed = 40 timefmt = '' needs_subscription = True - keep_only_tags = [ dict(attrs={ 'id':['article']}), - dict(attrs={'class':['blog wrap']}) ] + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix', - 'inlineVideo left brightcove', 'entry-meta']}), - dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles', - 'portfolioInline','articleInline','readerscomment', - 'nytRating']}) ] + remove_tags_before = dict(id='article') + remove_tags_after = dict(id='article') + remove_tags = [dict(attrs={'class':[ + 'articleFooter', + 'articleTools', + 'columnGroup doubleRule', + 'columnGroup singleRule', + 'columnGroup last', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'icon enlargeThis', + 'leftNavTabs', + 'module box nav', + 'nextArticleLink', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 'singleAd', + 'subNavigation tabContent active clearfix', + ]}), + dict(id=[ + 'adxLeaderboard', + 'archive', + 'articleExtras', + 'articleInline', + 'blog_sidebar', + 'cCol', + 'entertainmentSearchBar', + 'footer', + 'header', + 'header_search', + 'login', + 'masthead', + 'memberTools', + 'navigation', + 'portfolioInline', + 'relatedArticles', + 'side_search', + 'side_index', + 'side_tool', + 'toolsRight', + ]), + dict(name=['script', 'noscript', 'style'])] - encoding = 'cp1252' no_stylesheets = True extra_css = '.headline {text-align: left;}\n \ .byline {font-family: monospace; \ text-align: left; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .dateline {font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .timestamp {font-size: small; \ + margin-top: 0px; \ margin-bottom: 0px;}\n \ - .timestamp {font-size: smaller;}\n \ .source {text-align: left;}\n \ .image {text-align: center;}\n \ .credit {text-align: right; \ - font-size: smaller;}\n \ + font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ .articleBody {text-align: left;}\n \ .authorId {text-align: left; \ font-style: italic;}\n ' + def dump_ans(self, ans) : + total_article_count = 0 + for section in ans : + if self.verbose: + 
self.log("section %s: %d articles" % (section[0], len(section[1])) ) + for article in section[1]: + total_article_count += 1 + if self.verbose: + self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), + article['url'].encode('cp1252','replace'))) + self.log( "Queued %d articles" % total_article_count ) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + + return fixed + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + try: + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + br.submit() + except: + self.log("\nFailed to login") + return br + def get_cover_url(self): cover = None st = time.localtime() @@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def get_masthead_url(self): - masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - br = BasicNewsRecipe.get_browser() - try: - br.open(masthead) - except: - self.log("\nCover unavailable") - masthead = None - return masthead - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - br.submit() - return br - def index_to_soup(self, url_or_raw, raw=False): ''' OVERRIDE of class method @@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe): return BeautifulSoup(_raw, markupMassage=massage) # Entry point + print "index_to_soup()" soup = get_the_soup( self.encoding, url_or_raw ) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] @@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe): return soup + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + def parse_index(self): articles = {} ans = [] @@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe): feed = key = 'All Top Stories' articles[key] = [] ans.append(key) + self.log("Scanning 1 section ...") soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') # Fetch the outer table table = soup.find('table') previousTable = table + contentTable = None # Find the deepest table containing the stories while True : @@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe): continue skipThisSection = False - + todays_article_count = 0 # Within this table are entries + self.log("Fetching feed Top Stories") for tr in storyblock.findAllNext('tr'): if tr.find('span') is not None : @@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe): # Fetch the article titles and URLs articleCount = len(sectionblock.findAll('span')) + todays_article_count += articleCount for (i,span) in 
enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) : a = span.find('a', href=True) url = re.sub(r'\?.*', '', a['href']) @@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe): if duplicateFound: # Continue fetching, don't add this article + todays_article_count -= 1 continue if not articles.has_key(feed): @@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe): articles[feed].append( dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) +# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories")) ans = self.sort_index_by(ans, {'Top Stories':-1}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + self.dump_ans(ans) return ans + def preprocess_html(self, soup): + return self.strip_anchors(soup) + + def postprocess_html(self,soup, True): + + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg after headline + cgFirst = soup.find(True, {'class':'columnGroup first'}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + # Change class="kicker" to
<h3>
+        kicker = soup.find(True, {'class':'kicker'})
+        if kicker and kicker.contents[0]:
+            h3Tag = Tag(soup, "h3")
+            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
+                         use_alt=False)))
+            kicker.replaceWith(h3Tag)
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption and caption.contents[0]:
+                emTag = Tag(soup, "em")
+                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                mp_off = c.find("More Photos")
+                if mp_off >= 0:
+                    c = c[:mp_off]
+                emTag.insert(0, c)
+                hrTag = Tag(soup, 'hr')
+                #hrTag['style'] = "margin-top:0em;margin-bottom:0em"
+                emTag.insert(1, hrTag)
+                caption.replaceWith(emTag)
+
+        # Change <nyt_headline> to <h2 class="headline">
+ h1 = soup.find('h1') + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + else: + # Blog entry - replace headline, remove
<hr> tags
+            headline = soup.find('title')
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                soup.insert(0, tag)
+            hrs = soup.findAll('hr')
+            for hr in hrs:
+                hr.extract()
+
+        # Change <h1> to <h3>
- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + + # Change to + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + + # Synthesize a section header + dsk = soup.find('meta', attrs={'name':'dsk'}) + if dsk and dsk.has_key('content'): + hTag = Tag(soup,'h3') + hTag['class'] = 'section' + hTag.insert(0,NavigableString(dsk['content'])) + articleTag = soup.find(True, attrs={'id':'article'}) + if articleTag: + articleTag.insert(0,hTag) + + # Add class="articleBody" to
<div id="articleBody"> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag:
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div id="authorId">
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + + return soup + def strip_anchors(self,soup): paras = soup.findAll(True) for para in paras: @@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe): if a.img is None: a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - - def preprocess_html(self, soup): -# refresh = soup.find('meta', {'http-equiv':'refresh'}) -# if refresh is None: -# return self.strip_anchors(soup) -# -# content = refresh.get('content').partition('=')[2] -# raw = self.browser.open('http://www.nytimes.com'+content).read() -# soup = BeautifulSoup(raw.decode('cp1252', 'replace')) - return self.strip_anchors(soup) - refresh = soup.find('meta', {'http-equiv':'refresh'}) - if refresh is not None: - content = refresh.get('content').partition('=')[2] - raw = self.browser.open('http://www.nytimes.com'+content).read() - soup = BeautifulSoup(raw.decode('cp1252', 'replace')) - - soup = self.strip_anchors(soup) - - # Test for empty content - body = soup.find('body') - tagCount = len(body.findAll(True)) - if tagCount: -# print "%d tags in article" % tagCount - return soup - else: - print "no allowed content found, removing article" - raise Exception - - def postprocess_html(self,soup, True): - - # Change class="kicker" to
<h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker is not None :
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, kicker.contents[0])
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption is not None:
-                emTag = Tag(soup, "em")
-                emTag.insert(0, caption.contents[0])
-                hrTag = Tag(soup, 'hr')
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
-
-        # Change <nyt_headline> to <h2 class="headline">

-        headline = soup.find("nyt_headline")
-        if headline is not None :
-            tag = Tag(soup, "h2")
-            tag['class'] = "headline"
-            tag.insert(0, headline.contents[0])
-            soup.h1.replaceWith(tag)
-
-        # Change <h1> to <h3>
- used in editorial blogs - masthead = soup.find("h1") - if masthead is not None : - # Nuke the href - if masthead.a is not None : - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, masthead.contents[0]) - soup.h1.replaceWith(tag) - - # Change to - for subhead in soup.findAll(True, {'class':'bold'}) : - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - - # Synthesize a section header - dsk = soup.find('meta', attrs={'name':'dsk'}) - if dsk is not None and dsk.has_key('content'): - hTag = Tag(soup,'h3') - hTag['class'] = 'section' - hTag.insert(0,NavigableString(dsk['content'])) - articleTag = soup.find(True, attrs={'id':'article'}) - articleTag.insert(0,hTag) - - # Add class="articleBody" to
<div id="articleBody"> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag is not None :
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div id="authorId">
so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag is not None : - divTag['class'] = divTag['id'] - - return soup - diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 08d658d9bc..9f2efa9f9b 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -20,7 +20,7 @@ class ANDROID(USBMS): VENDOR_ID = { 0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]}, 0x22b8 : { 0x41d9 : [0x216]}, - 0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]}, + 0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]}, 0x04e8 : { 0x681d : [0x0222]}, } EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']
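Note on the Android driver hunk above: VENDOR_ID maps USB vendor id to product id to a
list of accepted bcdDevice revisions, and this change adds revision 0x226 for the Google
(0x18d1) device ids. A sketch of how such a table can be consulted (is_supported is
illustrative, not calibre's USBMS detection code):

    VENDOR_ID = {
        0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
        0x22b8 : { 0x41d9 : [0x216]},
        0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12 : [0x0100, 0x226]},
        0x04e8 : { 0x681d : [0x0222]},
    }

    # Illustrative check only; calibre's USBMS scanner does the real matching.
    def is_supported(vid, pid, bcd):
        return bcd in VENDOR_ID.get(vid, {}).get(pid, [])

    print is_supported(0x18d1, 0x4e11, 0x226)   # True with this patch applied
    print is_supported(0x18d1, 0x4e11, 0x200)   # False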