diff --git a/resources/recipes/fudzilla.recipe b/resources/recipes/fudzilla.recipe index 821488ad0a..b47b4d4cab 100644 --- a/resources/recipes/fudzilla.recipe +++ b/resources/recipes/fudzilla.recipe @@ -25,15 +25,15 @@ class Fudzilla(BasicNewsRecipe): remove_tags_before = dict(name='div', attrs={'class':['padding']}) remove_tags = [dict(name='td', attrs={'class':['left','right']}), - dict(name='div', attrs={'id':['toolbar','buttons']}), - dict(name='div', attrs={'class':['artbannersxtd','back_button']}), - dict(name='span', attrs={'class':['pathway']}), - dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), - dict(name='table', attrs={'class':['headlines']}), + dict(name='div', attrs={'id':['toolbar','buttons']}), + dict(name='div', attrs={'class':['artbannersxtd','back_button']}), + dict(name='span', attrs={'class':['pathway']}), + dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), + dict(name='table', attrs={'class':['headlines']}), ] feeds = [ - (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1') + (u'Posts', u'http://www.fudzilla.com/?format=feed') ] preprocess_regexps = [ diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index a2d5135045..c656450990 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>' ''' nytimes.com ''' -import re -import time -from calibre import entity_to_unicode +import re, string, time +from calibre import entity_to_unicode, strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ -Comment, BeautifulStoneSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup class NYTimes(BasicNewsRecipe): - title = 'New York Times Top Stories' - __author__ = 'GRiker' - language = 'en' - requires_version = (0, 7, 5) - description = 'Top Stories from the New York Times' + # set headlinesOnly to True for the headlines-only version + headlinesOnly = True - # List of sections typically included in Top Stories. Use a keyword from the - # right column in the excludeSectionKeywords[] list to skip downloading that section - sections = { - 'arts' : 'Arts', - 'business' : 'Business', - 'diningwine' : 'Dining & Wine', - 'editorials' : 'Editorials', - 'health' : 'Health', - 'magazine' : 'Magazine', - 'mediaadvertising' : 'Media & Advertising', - 'newyorkregion' : 'New York/Region', - 'oped' : 'Op-Ed', - 'politics' : 'Politics', - 'science' : 'Science', - 'sports' : 'Sports', - 'technology' : 'Technology', - 'topstories' : 'Top Stories', - 'travel' : 'Travel', - 'us' : 'U.S.', - 'world' : 'World' - } + # includeSections: List of sections to include. If empty, all sections found will be included. + # Otherwise, only the sections named will be included. For example, + # + # includeSections = ['Politics','Sports'] + # + # would cause only the Politics and Sports sections to be included.
- # Add section keywords from the right column above to skip that section - # For example, to skip sections containing the word 'Sports' or 'Dining', use: - # excludeSectionKeywords = ['Sports', 'Dining'] - # Fetch only Business and Technology - # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World'] - # Fetch only Top Stories - # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World'] - # By default, no sections are skipped. - excludeSectionKeywords = [] + includeSections = [] # by default, all sections included + + # excludeSections: List of sections to exclude. If empty, all sections found will be included. + # Otherwise, the sections named will be excluded. For example, + # + # excludeSections = ['Politics','Sports'] + # + # would cause the Politics and Sports sections to be excluded. This parameter can be used + # in conjunction with includeSections although in most cases using one or the other, but + # not both, is sufficient. + + excludeSections = [] # one_picture_per_article specifies that calibre should only use the first image # from an article (if one exists). If one_picture_per_article = True, the image # will be moved to a location between the headline and the byline. # If one_picture_per_article = False, all images from the article will be included + # and shown in their original location. one_picture_per_article = True # The maximum number of articles that will be downloaded - max_articles_per_feed = 40 + max_articles_per_feed = 100 + + + if headlinesOnly: + title='New York Times Headlines' + description = 'Headlines from the New York Times' + else: + title='New York Times' + description = 'Today\'s New York Times' + + __author__ = 'GRiker/Kovid Goyal/Nick Redding' + language = 'en' + requires_version = (0, 7, 5) + + timefmt = '' needs_subscription = True @@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe): 'entry-response module', 'icon enlargeThis', 'leftNavTabs', + 'metaFootnote', 'module box nav', 'nextArticleLink', 'nextArticleLink clearfix', @@ -89,12 +87,13 @@ 'relatedSearchesModule', 'side_tool', 'singleAd', - 'subNavigation clearfix', - 'subNavigation tabContent active', - 'subNavigation tabContent active clearfix', + re.compile('^subNavigation'), + re.compile('^leaderboard'), + re.compile('^module'), ]}), dict(id=[ 'adxLeaderboard', + 'adxSponLink', 'archive', 'articleExtras', 'articleInline', @@ -105,87 +104,98 @@ 'footer', 'header', 'header_search', + 'inlineBox', 'login', 'masthead', 'masthead-nav', 'memberTools', 'navigation', 'portfolioInline', + 'readerReviews', + 'readerReviewsCount', 'relatedArticles', + 'relatedTopics', 'respond', 'side_search', 'side_index', 'side_tool', 'toolsRight', ]), - dict(name=['script', 'noscript', 'style'])] - + dict(name=['script', 'noscript', 'style','form','hr'])] no_stylesheets = True - extra_css = '.headline {text-align: left;}\n \ - .byline {font-family: monospace; \ - text-align: left; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .dateline {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .timestamp {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .source {text-align: left;}\n \ - .image {text-align: center;}\n \ - .credit {text-align: right; \ - font-size: small; \ - margin-top:
0px; \ - margin-bottom: 0px;}\n \ - .articleBody {text-align: left;}\n \ - .authorId {text-align: left; \ - font-style: italic;}\n ' + extra_css = ''' + .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } + .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { text-align: left; font-size: small; } + .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + a:link {text-decoration: none; } + .articleBody { } + .authorId {text-align: left; } + .image {text-align: center;} + .source {text-align: left; }''' - def dump_ans(self, ans) : + def filter_ans(self, ans) : total_article_count = 0 - for section in ans : + idx = 0 + idx_max = len(ans)-1 + while idx <= idx_max: + if self.includeSections != []: + if ans[idx][0] not in self.includeSections: + print "SECTION NOT INCLUDED: ",ans[idx][0] + del ans[idx] + idx_max = idx_max-1 + continue + if ans[idx][0] in self.excludeSections: + print "SECTION EXCLUDED: ",ans[idx][0] + del ans[idx] + idx_max = idx_max-1 + continue if self.verbose: - self.log("section %s: %d articles" % (section[0], len(section[1])) ) - for article in section[1]: + self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) + for article in ans[idx][1]: total_article_count += 1 if self.verbose: self.log("\t%-40.40s... \t%-60.60s..." 
% (article['title'].encode('cp1252','replace'), article['url'].encode('cp1252','replace'))) + idx = idx+1 + + self.log( "Queued %d articles" % total_article_count ) + return ans def fixChars(self,string): # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) + fixed = re.sub("\x91","‘",string) # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) + fixed = re.sub("\x92","’",fixed) # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) + fixed = re.sub("\x93","“",fixed) # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) + fixed = re.sub("\x94","”",fixed) # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) + fixed = re.sub("\x96","–",fixed) # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) + fixed = re.sub("\x97","—",fixed) return fixed def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - try: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - br.submit() - except: - self.log("\nFailed to login") + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Please try again' in raw: + raise Exception('Your username and password are incorrect') return br def skip_ad_pages(self, soup): @@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe): cover = None return cover + def short_title(self): + return self.title + def index_to_soup(self, url_or_raw, raw=False): ''' OVERRIDE of class method @@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe): # Kindle TOC descriptions won't render certain characters if description: massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&amp;' - massaged = re.sub("&","&amp;", massaged) + # Replace '&' with '&amp;' + massaged = re.sub("&","&amp;", massaged) return self.fixChars(massaged) else: return description - def parse_index(self): + def parse_todays_index(self): + + def feed_title(div): + return ''.join(div.findAll(text=True, recursive=True)).strip() + + articles = {} + key = None + ans = [] + url_list = [] + + def handle_article(div): + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + return + if not url.endswith(".html"): + return + if 'podcast' in url: + return + if '/video/' in url: + return + url += '?pagewanted=all' + if url in url_list: + return + url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + feed = key if key is not None else 'Uncategorized' + if not articles.has_key(feed): + ans.append(feed) + articles[feed] = [] + articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) + + + soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') + + + # Find each article + for div in soup.findAll(True, +
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + + if div['class'] in ['section-headline','sectionHeader']: + key = string.capwords(feed_title(div)) + key = key.replace('Op-ed','Op-Ed') + key = key.replace('U.s.','U.S.') + elif div['class'] in ['story', 'story headline'] : + handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + handle_article(lidiv) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return self.filter_ans(ans) + + def parse_headline_index(self): + articles = {} ans = [] - - feed = key = 'All Top Stories' - articles[key] = [] - ans.append(key) - self.log("Scanning 1 section ...") + url_list = [] soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') - # Fetch the outer table - table = soup.find('table') - previousTable = table + # Fetch the content table + content_table = soup.find('table',{'id':'content'}) + if content_table is None: + self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE") + return None - # Find the deepest table containing the stories - while True : - table = table.find('table') - if table.find(text=re.compile('top stories start')) : - previousTable = table - continue - else : - table = previousTable - break + # Within this table are entries, each containing one or more h6 tags which represent sections - # There are multiple subtables, find the one containing the stories - for block in table.findAll('table') : - if block.find(text=re.compile('top stories start')) : - table = block - break - else : - continue + for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): + for div_sec in td_col.findAll('div',recursive=False): + for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): + section_name = self.tag_to_string(h6_sec_name,use_alt=False) + section_name = re.sub(r'^ *$','',section_name) + if section_name == '': + continue + section_name=string.capwords(section_name) + if section_name == 'U.s.': + section_name = 'U.S.' 
+ elif section_name == 'Op-ed': + section_name = 'Op-Ed' + pubdate = strftime('%a, %d %b') - # Again there are multiple subtables, find the one containing the stories - for storyblock in table.findAll('table') : - if storyblock.find(text=re.compile('top stories start')) : - break - else : - continue - - skipThisSection = False - todays_article_count = 0 - # Within this table are entries - self.log("Fetching feed Top Stories") - for tr in storyblock.findAllNext('tr'): - if tr.find('span') is not None : - - sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif', - 'times new roman,times, sans serif', - 'times new roman, times, sans serif']}) - section = None - bylines = [] - descriptions = [] - pubdate = None - - # Get the Section title - for (x,i) in enumerate(sectionblock.contents) : - skipThisSection = False - # Extract the section title - if ('Comment' in str(i.__class__)) : - if 'start(name=' in i : - section = i[i.find('=')+1:-2] - - if not self.sections.has_key(section) : - skipThisSection = True + search_div = div_sec + for next_tag in h6_sec_name.findNextSiblings(True): + if next_tag.__class__.__name__ == 'Tag': + if next_tag.name == 'div': + search_div = next_tag break - # Check for excluded section - if len(self.excludeSectionKeywords): - key = self.sections[section] - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key) or articles.has_key(key): - skipThisSection = True - break - - # Get the bylines and descriptions - if not skipThisSection : - lines = sectionblock.contents - contentStrings = [] - - for line in lines: - if not isinstance(line, Comment) and line.strip and line.strip() > "": - contentStrings.append(line.strip()) - - # Gather the byline/description pairs - bylines = [] - descriptions = [] - for contentString in contentStrings: - if contentString[0:3] == 'By ' and contentString[3].isupper() : - bylines.append(contentString) + # Get the articles + for h3_item in search_div.findAll('h3'): + byline = h3_item.h6 + if byline is not None: + author = self.tag_to_string(byline,use_alt=False) else: - descriptions.append(contentString) - - # Fetch the article titles and URLs - articleCount = len(sectionblock.findAll('span')) - todays_article_count += articleCount - for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) : - a = span.find('a', href=True) + author = '' + a = h3_item.find('a', href=True) + if not a: + continue url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + continue + if not url.endswith(".html"): + continue + if 'podcast' in url: + continue + if 'video' in url: + continue url += '?pagewanted=all' + if url in url_list: + continue + url_list.append(url) + self.log("URL %s" % url) + title = self.tag_to_string(a, use_alt=True).strip() + desc = h3_item.find('p') + if desc is not None: + description = self.tag_to_string(desc,use_alt=False) + else: + description = '' + if not articles.has_key(section_name): + ans.append(section_name) + articles[section_name] = [] + articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) - title = self.tag_to_string(a, use_alt=True) - # prepend the section name - title = self.sections[section] + " · " + title - if not isinstance(title, unicode): - title = title.decode('utf-8', 'replace') - - # Allow for unattributed, undescribed entries "Editor's Note" - if i >= len(descriptions) : - description = None - else : - description = descriptions[i] - - if len(bylines)
== articleCount : - author = bylines[i] - else : - author = None - - # Check for duplicates - duplicateFound = False - if len(articles[feed]) > 1: - for article in articles[feed] : - if url == article['url'] : - duplicateFound = True - break - - if duplicateFound: - # Continue fetching, don't add this article - todays_article_count -= 1 - continue - - if not articles.has_key(feed): - articles[feed] = [] - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, content='')) -# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories")) - - ans = self.sort_index_by(ans, {'Top Stories':-1}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - self.dump_ans(ans) - return ans + return self.filter_ans(ans) + + def parse_index(self): + if self.headlinesOnly: + return self.parse_headline_index() + else: + return self.parse_todays_index() + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup + def preprocess_html(self, soup): + + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: # remove Op-Ed author head shots + tagline = self.tag_to_string(kicker_tag) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + if img_div: + img_div.extract() return self.strip_anchors(soup) def postprocess_html(self,soup, True): @@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe): firstImg = inlineImgs[0] for inlineImg in inlineImgs[1:]: inlineImg.extract() - # Move firstImg after headline - cgFirst = soup.find(True, {'class':'columnGroup first'}) + # Move firstImg before article body + #article_body = soup.find(True, {'id':'articleBody'}) + cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) if cgFirst: # Strip all sibling NavigableStrings: noise navstrings = cgFirst.findAll(text=True, recursive=False) @@ -443,30 +484,18 @@ if headline_found: cgFirst.insert(insertLoc,firstImg) else: - self.log(">>> No class:'columnGroup first' found <<<") - # Change class="kicker" to <h3>
- kicker = soup.find(True, {'class':'kicker'}) - if kicker and kicker.contents[0]: - h3Tag = Tag(soup, "h3") - h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, - use_alt=False))) - kicker.replaceWith(h3Tag) + self.log(">>> No class:'columnGroup first' found <<<") - # Change captions to italic -1 + # Change captions to italic for caption in soup.findAll(True, {'class':'caption'}) : if caption and caption.contents[0]: - emTag = Tag(soup, "em") + cTag = Tag(soup, "p", [("class", "caption")]) c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() mp_off = c.find("More Photos") if mp_off >= 0: c = c[:mp_off] - emTag.insert(0, c) - #hrTag = Tag(soup, 'hr') - #hrTag['class'] = 'caption_divider' - hrTag = Tag(soup, 'div') - hrTag['class'] = 'divider' - emTag.insert(1, hrTag) - caption.replaceWith(emTag) + cTag.insert(0, c) + caption.replaceWith(cTag) # Change <nyt_headline> to <h2>
h1 = soup.find('h1') @@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe): bTag.insert(0, subhead.contents[0]) subhead.replaceWith(bTag) - # Synthesize a section header - dsk = soup.find('meta', attrs={'name':'dsk'}) - if dsk and dsk.has_key('content'): - hTag = Tag(soup,'h3') - hTag['class'] = 'section' - hTag.insert(0,NavigableString(dsk['content'])) - articleTag = soup.find(True, attrs={'id':'article'}) - if articleTag: - articleTag.insert(0,hTag) - - # Add class="articleBody" to
<div> so we can format with CSS divTag = soup.find('div',attrs={'id':'articleBody'}) if divTag: divTag['class'] = divTag['id'] @@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe): return soup - def strip_anchors(self,soup): - paras = soup.findAll(True) - for para in paras: - aTags = para.findAll('a') - for a in aTags: - if a.img is None: - a.replaceWith(a.renderContents().decode('cp1252','replace')) - return soup diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 5452ae1c6e..ed1ba75f0f 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>' ''' nytimes.com ''' -import string, re, time -from calibre import strftime +import re, string, time +from calibre import entity_to_unicode, strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -def decode(self, src): - enc = 'utf-8' - if 'iso-8859-1' in src: - enc = 'cp1252' - return src.decode(enc, 'ignore') +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup class NYTimes(BasicNewsRecipe): - title = u'New York Times' - __author__ = 'Kovid Goyal/Nick Redding' - language = 'en' - requires_version = (0, 6, 36) + # set headlinesOnly to True for the headlines-only version + headlinesOnly = False - description = 'Daily news from the New York Times (subscription version)' - timefmt = ' [%b %d]' + # includeSections: List of sections to include. If empty, all sections found will be included. + # Otherwise, only the sections named will be included. For example, + # + # includeSections = ['Politics','Sports'] + # + # would cause only the Politics and Sports sections to be included. + + includeSections = [] # by default, all sections included + + # excludeSections: List of sections to exclude. If empty, all sections found will be included. + # Otherwise, the sections named will be excluded. For example, + # + # excludeSections = ['Politics','Sports'] + # + # would cause the Politics and Sports sections to be excluded. This parameter can be used + # in conjunction with includeSections although in most cases using one or the other, but + # not both, is sufficient. + + excludeSections = [] + + # one_picture_per_article specifies that calibre should only use the first image + # from an article (if one exists). If one_picture_per_article = True, the image + # will be moved to a location between the headline and the byline. + # If one_picture_per_article = False, all images from the article will be included + + # and shown in their original location.
+ one_picture_per_article = True + + # The maximum number of articles that will be downloaded + max_articles_per_feed = 100 + + + if headlinesOnly: + title='New York Times Headlines' + description = 'Headlines from the New York Times' + else: + title='New York Times' + description = 'Today\'s New York Times' + + __author__ = 'GRiker/Kovid Goyal/Nick Redding' + language = 'en' + requires_version = (0, 7, 5) + + + timefmt = '' needs_subscription = True + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + cover_margins = (18,18,'grey99') + remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink', - 'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta', - 'icon enlargeThis','columnGroup last','relatedSearchesModule']}), - dict({'class':re.compile('^subNavigation')}), - dict({'class':re.compile('^leaderboard')}), - dict({'class':re.compile('^module')}), - dict({'class':'metaFootnote'}), - dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead', - 'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline', - 'side_tool', 'side_index','header','readerReviewsCount','readerReviews', - 'relatedArticles', 'relatedTopics', 'adxSponLink']), + remove_tags = [dict(attrs={'class':[ + 'articleFooter', + 'articleTools', + 'columnGroup doubleRule', + 'columnGroup singleRule', + 'columnGroup last', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'entry-response module', + 'icon enlargeThis', + 'leftNavTabs', + 'metaFootnote', + 'module box nav', + 'nextArticleLink', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 'singleAd', + re.compile('^subNavigation'), + re.compile('^leaderboard'), + re.compile('^module'), + ]}), + dict(id=[ + 'adxLeaderboard', + 'adxSponLink', + 'archive', + 'articleExtras', + 'articleInline', + 'blog_sidebar', + 'businessSearchBar', + 'cCol', + 'entertainmentSearchBar', + 'footer', + 'header', + 'header_search', + 'inlineBox', + 'login', + 'masthead', + 'masthead-nav', + 'memberTools', + 'navigation', + 'portfolioInline', + 'readerReviews', + 'readerReviewsCount', + 'relatedArticles', + 'relatedTopics', + 'respond', + 'side_search', + 'side_index', + 'side_tool', + 'toolsRight', + ]), dict(name=['script', 'noscript', 'style','form','hr'])] - encoding = decode no_stylesheets = True extra_css = ''' - .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; } - .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } - .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } + .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; 
} - .timestamp { font-size: small; } - .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - a:link {text-decoration: none; }''' + .timestamp { text-align: left; font-size: small; } + .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + a:link {text-decoration: none; } + .articleBody { } + .authorId {text-align: left; } + .image {text-align: center;} + .source {text-align: left; }''' + + def filter_ans(self, ans) : + total_article_count = 0 + idx = 0 + idx_max = len(ans)-1 + while idx <= idx_max: + if self.includeSections != []: + if ans[idx][0] not in self.includeSections: + print "SECTION NOT INCLUDED: ",ans[idx][0] + del ans[idx] + idx_max = idx_max-1 + continue + if ans[idx][0] in self.excludeSections: + print "SECTION EXCLUDED: ",ans[idx][0] + del ans[idx] + idx_max = idx_max-1 + continue + if self.verbose: + self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) + for article in ans[idx][1]: + total_article_count += 1 + if self.verbose: + self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), + article['url'].encode('cp1252','replace'))) + idx = idx+1 + + self.log( "Queued %d articles" % total_article_count ) + return ans + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + + return fixed def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe): br['USERID'] = self.username br['PASSWORD'] = self.password raw = br.submit().read() - if 'Sorry, we could not find the combination you entered. Please try again.' 
in raw: + if 'Please try again' in raw: raise Exception('Your username and password are incorrect') - #open('/t/log.html', 'wb').write(raw) return br - def get_masthead_url(self): - masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - #masthead = 'http://members.cox.net/nickredding/nytlogo.gif' - br = BasicNewsRecipe.get_browser() - try: - br.open(masthead) - except: - self.log("\nMasthead unavailable") - masthead = None - return masthead - + def skip_ad_pages(self, soup): + # Skip ad pages served before actual article + skip_tag = soup.find(True, {'name':'skip'}) + if skip_tag is not None: + self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) + url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) + url += '?pagewanted=all' + self.log.warn("Skipping ad to article at '%s'" % url) + return self.index_to_soup(url, raw=True) def get_cover_url(self): cover = None @@ -93,12 +224,57 @@ return cover def short_title(self): - return 'New York Times' + return self.title - def parse_index(self): - self.encoding = 'cp1252' - soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - self.encoding = decode + def index_to_soup(self, url_or_raw, raw=False): + ''' + OVERRIDE of class method + deals with various page encodings between index and articles + ''' + def get_the_soup(docEncoding, url_or_raw, raw=False) : + if re.match(r'\w+://', url_or_raw): + f = self.browser.open(url_or_raw) + _raw = f.read() + f.close() + if not _raw: + raise RuntimeError('Could not fetch index from %s'%url_or_raw) + else: + _raw = url_or_raw + if raw: + return _raw + + if not isinstance(_raw, unicode) and self.encoding: + _raw = _raw.decode(docEncoding, 'replace') + massage = list(BeautifulSoup.MARKUP_MASSAGE) + massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding))) + return BeautifulSoup(_raw, markupMassage=massage) + + # Entry point + print "index_to_soup()" + soup = get_the_soup( self.encoding, url_or_raw ) + contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) + docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] + if docEncoding == '' : + docEncoding = self.encoding + + if self.verbose > 2: + self.log( " document encoding: '%s'" % docEncoding) + if docEncoding != self.encoding : + soup = get_the_soup(docEncoding, url_or_raw) + + return soup + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&amp;' + massaged = re.sub("&","&amp;", massaged) + return self.fixChars(massaged) + else: + return description + + def parse_todays_index(self): def feed_title(div): return ''.join(div.findAll(text=True, recursive=True)).strip() @@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe): return if 'podcast' in url: return + if '/video/' in url: + return url += '?pagewanted=all' if url in url_list: return url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() - #self.log("Title: %s" % title) description = '' pubdate = strftime('%a, %d %b') summary = div.find(True, attrs={'class':'summary'}) @@ -140,6 +317,7 @@ author = self.tag_to_string(authorAttribution, use_alt=False) feed = key if key is not None else 'Uncategorized' if not articles.has_key(feed): +
ans.append(feed) articles[feed] = [] articles[feed].append( dict(title=title, url=url, date=pubdate, @@ -147,46 +325,228 @@ class NYTimes(BasicNewsRecipe): content='')) + soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - # Find each instance of class="section-headline", class="story", class="story headline" + + # Find each article for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): if div['class'] in ['section-headline','sectionHeader']: key = string.capwords(feed_title(div)) - articles[key] = [] - ans.append(key) - #self.log('Section: %s' % key) - + key = key.replace('Op-ed','Op-Ed') + key = key.replace('U.s.','U.S.') elif div['class'] in ['story', 'story headline'] : handle_article(div) elif div['class'] == 'headlinesOnly multiline flush': for lidiv in div.findAll('li'): handle_article(lidiv) -# ans = self.sort_index_by(ans, {'The Front Page':-1, -# 'Dining In, Dining Out':1, -# 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return self.filter_ans(ans) + + def parse_headline_index(self): + + articles = {} + ans = [] + url_list = [] + + soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') + + # Fetch the content table + content_table = soup.find('table',{'id':'content'}) + if content_table is None: + self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE") + return None + + # Within this table are entries, each containing one or more h6 tags which represent sections + + for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): + for div_sec in td_col.findAll('div',recursive=False): + for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): + section_name = self.tag_to_string(h6_sec_name,use_alt=False) + section_name = re.sub(r'^ *$','',section_name) + if section_name == '': + continue + section_name=string.capwords(section_name) + if section_name == 'U.s.': + section_name = 'U.S.' 
+ elif section_name == 'Op-ed': + section_name = 'Op-Ed' + pubdate = strftime('%a, %d %b') + + search_div = div_sec + for next_tag in h6_sec_name.findNextSiblings(True): + if next_tag.__class__.__name__ == 'Tag': + if next_tag.name == 'div': + search_div = next_tag + break + + # Get the articles + for h3_item in search_div.findAll('h3'): + byline = h3_item.h6 + if byline is not None: + author = self.tag_to_string(byline,use_alt=False) + else: + author = '' + a = h3_item.find('a', href=True) + if not a: + continue + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + continue + if not url.endswith(".html"): + continue + if 'podcast' in url: + continue + if 'video' in url: + continue + url += '?pagewanted=all' + if url in url_list: + continue + url_list.append(url) + self.log("URL %s" % url) + title = self.tag_to_string(a, use_alt=True).strip() + desc = h3_item.find('p') + if desc is not None: + description = self.tag_to_string(desc,use_alt=False) + else: + description = '' + if not articles.has_key(section_name): + ans.append(section_name) + articles[section_name] = [] + articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return self.filter_ans(ans) + + def parse_index(self): + if self.headlinesOnly: + return self.parse_headline_index() + else: + return self.parse_todays_index() + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup - return ans def preprocess_html(self, soup): + kicker_tag = soup.find(attrs={'class':'kicker'}) - if kicker_tag: + if kicker_tag: # remove Op-Ed author head shots tagline = self.tag_to_string(kicker_tag) - #self.log("FOUND KICKER %s" % tagline) if tagline=='Op-Ed Columnist': img_div = soup.find('div','inlineImage module') - #self.log("Searching for photo") if img_div: img_div.extract() - #self.log("Photo deleted") - refresh = soup.find('meta', {'http-equiv':'refresh'}) - if refresh is None: - return soup - content = refresh.get('content').partition('=')[2] - raw = self.browser.open_novisit('http://www.nytimes.com'+content).read() - return BeautifulSoup(raw.decode('cp1252', 'replace')) + return self.strip_anchors(soup) + def postprocess_html(self,soup, True): + + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg before article body + #article_body = soup.find(True, {'id':'articleBody'}) + cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + +
# Change captions to italic + for caption in soup.findAll(True, {'class':'caption'}) : + if caption and caption.contents[0]: + cTag = Tag(soup, "p", [("class", "caption")]) + c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() + mp_off = c.find("More Photos") + if mp_off >= 0: + c = c[:mp_off] + cTag.insert(0, c) + caption.replaceWith(cTag) + + # Change <nyt_headline> to <h2>
+ h1 = soup.find('h1') + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + else: + # Blog entry - replace headline, remove <hr> tags + headline = soup.find('title') + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + soup.insert(0, tag) + hrs = soup.findAll('hr') + for hr in hrs: + hr.extract() + + # Change <h1> to <h3>
- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + + # Change <span class="bold"> to <b> + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + + divTag = soup.find('div',attrs={'id':'articleBody'}) + if divTag: + divTag['class'] = divTag['id'] + + # Add class="authorId" to <div>
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + + return soup diff --git a/resources/recipes/zeitde.recipe b/resources/recipes/zeitde.recipe index 7f2ca0f6b2..35835e0e6d 100644 --- a/resources/recipes/zeitde.recipe +++ b/resources/recipes/zeitde.recipe @@ -6,22 +6,25 @@ Fetch Die Zeit. ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class ZeitDe(BasicNewsRecipe): - title = 'ZEIT Online' - description = 'ZEIT Online' + title = 'Zeit Online' + description = 'Zeit Online' language = 'de' - lang = 'de_DE' - __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke' - use_embedded_content = False + __author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing' + max_articles_per_feed = 40 - remove_empty_feeds = True - no_stylesheets = True - no_javascript = True - encoding = 'utf-8' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':["response","pagination block","pagenav","inline link", "copyright"] }), + dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }), + dict(name='div', attrs={'id':["place_5","place_4","comments"]}) + ] + + keep_only_tags = [dict(id=['main'])] feeds = [ ('Seite 1', 'http://newsfeed.zeit.de/index_xml'), @@ -40,43 +43,15 @@ class ZeitDe(BasicNewsRecipe): ('Sport', 'http://newsfeed.zeit.de/sport/index'), ] - extra_css = ''' - .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;} - .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;} - .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} - .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} - .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small} - .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small} - .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; } - img.inline{float:none} - .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700} - .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;} - .infobox {border-style: solid; border-width: 1px;padding:8px;} - .infobox dt {font-weight:700;} - ''' + extra_css = '.reaktion,.taglist,.comments,.reponse,.responsetitle,.responsebody,.reponse,.inline,.date{display:none;}li.date{display:block}' + #filter_regexps = [r'ad.de.doubleclick.net/'] - keep_only_tags = [ - dict(name='div', attrs={'class':["article"]}) , - dict(name='ul', attrs={'class':["tools"]}) , - ] - remove_tags = [ - dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'), - dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }), - dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }), - dict(name='div', 
attrs={'id':["place_5","place_4","comments"]}) - ] - - remove_attributes = ['style', 'font'] - def get_article_url(self, article): ans = article.get('link',None) - ans += "?page=all" + ans += "?page=all&print=true" - if 'video' in ans or 'quiz' in ans : + if 'video' in ans or 'quiz' in ans or 'blog' in ans : ans = None return ans @@ -86,25 +61,3 @@ class ZeitDe(BasicNewsRecipe): return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','') except: return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg' - - def preprocess_html(self, soup): - soup.html['xml:lang'] = self.lang - soup.html['lang'] = self.lang - mtag = '' - soup.head.insert(0,mtag) - title = soup.find('h2', attrs={'class':'title'}) - if title is None: - print "no title" - return soup - info = Tag(soup,'ul',[('class','ebinfobox')]) - tools = soup.find('ul', attrs={'class':'tools'}) - #author = tools.find('li','author first') - for tag in ['author first', 'date', 'date first', 'author', 'source']: - line = tools.find('li', tag) - if line: - info.insert(0,line) - title.parent.insert(0,info) - tools.extract() - return soup - - diff --git a/resources/templates/html_export_default.css b/resources/templates/html_export_default.css new file mode 100644 index 0000000000..79787febfa --- /dev/null +++ b/resources/templates/html_export_default.css @@ -0,0 +1,60 @@ +body{ + margin:0px; + padding: 0.5em; + background-color:#F6F3E9; + font-size:12px; + font-family:Arial, Helvetica, sans-serif; +} + +.calibreMeta{ + background-color:#39322B; + color:white; + padding:10px; +} + +.calibreMeta a, .calibreEbNav a, .calibreEbNavTop a, .calibreToc a{ + color:white; +} + +.calibreMeta h1{ + margin:0px; + font-size:18px; + background-color:#39322B; +} + +.calibreEbookContent{ + padding:20px; +} + +.calibreEbNav, .calibreEbNavTop{ + clear:both; + background-color:#39322B; + color:white; + padding:10px; + text-align:center; +} + +.calibreEbNavTop{ + margin-bottom:20px; +} + +.calibreEbNav a, .calibreEbNavTop a{ + padding:0px 5px; +} + +.calibreTocIndex{ + line-height:18px; +} + +.calibreToc{ + float:left; + margin:20px; + width:300px; + background-color:#39322B; + color:white; + padding:10px; +} +.calibreEbookContent{ + width:600px; + float:left; +} diff --git a/resources/templates/html_export_default.tmpl b/resources/templates/html_export_default.tmpl new file mode 100644 index 0000000000..c3ed921255 --- /dev/null +++ b/resources/templates/html_export_default.tmpl @@ -0,0 +1,74 @@ + + + +${head_content}$ + + + + + + +
+
+ ${pos1=1}$ + ${for title in meta.titles():}$ + ${if pos1:}$ +

+ ${print title}$ +

+ ${:else:}$ +
${print title}$
+ ${:endif}$ + ${pos1=0}$ + ${:endfor}$ +
+
+ ${print ', '.join(meta.creators())}$ +
+
+ +
+ +
+ ${if prevLink or nextLink:}$ +
+ ${if prevLink:}$ + ${print _('previous page'),}$ + ${:else:}$ + ${print _('previous page'),}$ + ${:endif}$ + + ${if nextLink:}$ + ${print _('next page'),}$ + ${:endif}$ +
+ ${:endif}$ + + ${ebookContent}$ +
+ + ${if has_toc:}$ +
+

${print _('Table of contents'),}$

+ ${print toc()}$ +
+ ${:endif}$ + +
+ ${if prevLink:}$ + ${print _('previous page'),}$ + ${:else:}$ + ${print _('previous page'),}$ + ${:endif}$ + + ${print _('start'),}$ + + ${if nextLink:}$ + ${print _('next page'),}$ + ${:endif}$ +
+ +
+ + + diff --git a/resources/templates/html_export_default_index.tmpl b/resources/templates/html_export_default_index.tmpl new file mode 100644 index 0000000000..4a9e8ab6f3 --- /dev/null +++ b/resources/templates/html_export_default_index.tmpl @@ -0,0 +1,61 @@ + + + + + + + + +${print ', '.join(meta.creators()),}$ - ${print meta.titles().next(); meta.titles().close()}$ + +${for item in meta:}$ + +${:endfor}$ + + + + + +
+
+ ${pos1=1}$ + ${for title in meta.titles():}$ + ${if pos1:}$ +

+ ${print title}$ +

+ ${:else:}$ +
${print title}$
+ ${:endif}$ + ${pos1=0}$ + ${:endfor}$ +
+
+ ${print ', '.join(meta.creators()),}$ +
+
+ +
+
+ + ${if has_toc:}$ +
+

${print _('Table of contents'),}$

+ ${toc}$ +
+ ${:else:}$ +

${print _('No table of contents present'),}$

+ + ${:endif}$ + +
+ +
+ ${if nextLink:}$ + ${print _('next page'),}$ + ${:endif}$ +
+
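The ${...}$ constructs in these two templates are Templite+ statements (the engine is bundled under src/templite later in this patch), and the meta object they iterate over is the EasyMeta wrapper added in src/calibre/ebooks/html/meta.py below. A minimal sketch of that contract, with oeb_book assumed to come from the conversion pipeline:

    from calibre.ebooks.html.meta import EasyMeta

    meta = EasyMeta(oeb_book.metadata)          # oeb_book supplied by the conversion pipeline
    for item in meta:                           # drives ${for item in meta:}$ above
        print item['name'], '=', item['value']  # Dublin Core terms, e.g. title, creator
    print ', '.join(meta.creators())            # the author line used by both templates
    titles = meta.titles()                      # generator over the book's titles
    print titles.next(); titles.close()         # first title only, as in the index <title>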
+ + + diff --git a/setup/server.py b/setup/server.py index 0fea4ec733..5fabd17108 100644 --- a/setup/server.py +++ b/setup/server.py @@ -89,7 +89,7 @@ class Server(Command): t = telnetlib.Telnet('localhost', 4242) t.read_until("repl>") t.write('BrowserReload();') - print t.read_until("repl>") + t.read_until("repl>") t.close() except: print 'Failed to reload browser' diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 3cc84f248d..d64369e363 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -446,6 +446,7 @@ from calibre.ebooks.rb.output import RBOutput from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.tcr.output import TCROutput from calibre.ebooks.txt.output import TXTOutput +from calibre.ebooks.html.output import HTMLOutput from calibre.ebooks.snb.output import SNBOutput from calibre.customize.profiles import input_profiles, output_profiles @@ -525,6 +526,7 @@ plugins += [ RTFOutput, TCROutput, TXTOutput, + HTMLOutput, SNBOutput, ] # Order here matters. The first matched device is the one used. @@ -893,4 +895,3 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, InputOptions, Email, Server, Plugins, Tweaks, Misc] #}}} - diff --git a/src/calibre/ebooks/html/meta.py b/src/calibre/ebooks/html/meta.py new file mode 100644 index 0000000000..9a088efb16 --- /dev/null +++ b/src/calibre/ebooks/html/meta.py @@ -0,0 +1,33 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, Fabian Grassl ' +__docformat__ = 'restructuredtext en' + + +from calibre.ebooks.oeb.base import namespace, barename, DC11_NS + +class EasyMeta(object): + + def __init__(self, meta): + self.meta = meta + + def __iter__(self): + meta = self.meta + for item_name in meta.items: + for item in meta[item_name]: + if namespace(item.term) == DC11_NS: + yield { 'name': barename(item.term), 'value': item.value } + + def __len__(self): + count = 0 + for item in self: + count = count+1 + return count + + def titles(self): + for item in self.meta['title']: + yield item.value + + def creators(self): + for item in self.meta['creator']: + yield item.value diff --git a/src/calibre/ebooks/html/output.py b/src/calibre/ebooks/html/output.py new file mode 100644 index 0000000000..f1178d3259 --- /dev/null +++ b/src/calibre/ebooks/html/output.py @@ -0,0 +1,201 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, Fabian Grassl ' +__docformat__ = 'restructuredtext en' + +import os, re, shutil + +from os.path import dirname, abspath, relpath, exists + +from lxml import etree +from templite import Templite + +from calibre.ebooks.oeb.base import element +from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation +from calibre import CurrentDir +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.zipfile import ZipFile + +from urllib import unquote + +from calibre.ebooks.html.meta import EasyMeta + +class HTMLOutput(OutputFormatPlugin): + + name = 'HTML Output' + author = 'Fabian Grassl' + file_type = 'zip' + + options = set([ + OptionRecommendation(name='template_css', + help=_('CSS file used for the output instead of the default file')), + + OptionRecommendation(name='template_html_index', + help=_('Template used for generation of the html index file instead of the default file')), + + OptionRecommendation(name='template_html', + help=_('Template used for the generation of the html contents of the book instead of the default file')), + + 
OptionRecommendation(name='extract_to', + help=_('Extract the contents of the generated ZIP file to the directory of the generated ZIP file') + ), + ]) + + recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)]) + + def generate_toc(self, oeb_book, ref_url, output_dir): + ''' + Generate table of contents + ''' + with CurrentDir(output_dir): + def build_node(current_node, parent=None): + if parent is None: + parent = etree.Element('ul') + elif len(current_node.nodes): + parent = element(parent, ('ul')) + for node in current_node.nodes: + point = element(parent, 'li') + href = relpath(abspath(unquote(node.href)), dirname(ref_url)) + link = element(point, 'a', href=href) + title = node.title + if title: + title = re.sub(r'\s+', ' ', title) + link.text=title + build_node(node, point) + return parent + wrap = etree.Element('div') + wrap.append(build_node(oeb_book.toc)) + return wrap + + def generate_html_toc(self, oeb_book, ref_url, output_dir): + root = self.generate_toc(oeb_book, ref_url, output_dir) + return etree.tostring(root, pretty_print=True, encoding='utf-8', + xml_declaration=True) + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + + # read template files + if opts.template_html_index is not None: + template_html_index_data = open(opts.template_html_index, 'rb').read() + else: + template_html_index_data = P('templates/html_export_default_index.tmpl', data=True) + + if opts.template_html is not None: + template_html_data = open(opts.template_html, 'rb').read() + else: + template_html_data = P('templates/html_export_default.tmpl', data=True) + + if opts.template_css is not None: + template_css_data = open(opts.template_css, 'rb').read() + else: + template_css_data = P('templates/html_export_default.css', data=True) + + template_html_index_data = template_html_index_data.decode('utf-8') + template_html_data = template_html_data.decode('utf-8') + template_css_data = template_css_data.decode('utf-8') + + self.log = log + self.opts = opts + meta = EasyMeta(oeb_book.metadata) + + tempdir = PersistentTemporaryDirectory() + output_file = os.path.join(tempdir, + os.path.basename(re.sub(r'\.zip', '', output_path)+'.html')) + output_dir = re.sub(r'\.html', '', output_file)+'_files' + + if not exists(output_dir): + os.makedirs(output_dir) + + css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css' + with open(css_path, 'wb') as f: + f.write(template_css_data.encode('utf-8')) + + with open(output_file, 'wb') as f: + html_toc = self.generate_html_toc(oeb_book, output_file, output_dir) + templite = Templite(template_html_index_data) + nextLink = oeb_book.spine[0].href + nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file)) + cssLink = relpath(abspath(css_path), dirname(output_file)) + tocUrl = relpath(output_file, dirname(output_file)) + t = templite.render(has_toc=bool(oeb_book.toc.count()), + toc=html_toc, meta=meta, nextLink=nextLink, + tocUrl=tocUrl, cssLink=cssLink) + f.write(t) + + with CurrentDir(output_dir): + for item in oeb_book.manifest: + path = abspath(unquote(item.href)) + dir = dirname(path) + if not exists(dir): + os.makedirs(dir) + if item.spine_position is not None: + with open(path, 'wb') as f: + pass + else: + with open(path, 'wb') as f: + f.write(str(item)) + item.unload_data_from_memory(memory=path) + + for item in oeb_book.spine: + path = abspath(unquote(item.href)) + dir = dirname(path) + root = item.data.getroottree() + + # get & clean HTML <HEAD>-data + head = root.xpath('//h:head', namespaces={'h':
'http://www.w3.org/1999/xhtml'})[0] + head_content = etree.tostring(head, pretty_print=True, encoding='utf-8') + head_content = re.sub(r'\<\/?head.*\>', '', head_content) + head_content = re.sub(re.compile(r'\<style.*\<\/style\>', re.M|re.S), '', head_content) + + # get & clean HTML <BODY>-data + body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0] + ebook_content = etree.tostring(body, pretty_print=True, encoding='utf-8') + ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content) + + # generate link to next page + if item.spine_position+1 < len(oeb_book.spine): + nextLink = oeb_book.spine[item.spine_position+1].href + nextLink = relpath(abspath(nextLink), dir) + else: + nextLink = None + + # generate link to previous page + if item.spine_position > 0: + prevLink = oeb_book.spine[item.spine_position-1].href + prevLink = relpath(abspath(prevLink), dir) + else: + prevLink = None + + cssLink = relpath(abspath(css_path), dir) + tocUrl = relpath(output_file, dir) + + # render template + templite = Templite(template_html_data) + toc = lambda: self.generate_html_toc(oeb_book, path, output_dir) + t = templite.render(ebookContent=ebook_content, + prevLink=prevLink, nextLink=nextLink, + has_toc=bool(oeb_book.toc.count()), toc=toc, + tocUrl=tocUrl, head_content=head_content, + meta=meta, cssLink=cssLink) + + # write html to file + with open(path, 'wb') as f: + f.write(t) + item.unload_data_from_memory(memory=path) + + zfile = ZipFile(output_path, "w") + zfile.add_dir(output_dir) + + if opts.extract_to: + if os.path.exists(opts.extract_to): + shutil.rmtree(opts.extract_to) + os.makedirs(opts.extract_to) + zfile.extractall(opts.extract_to) + self.log('Zip file extracted to', opts.extract_to) + + zfile.close() + + # cleanup temp dir + shutil.rmtree(tempdir) + + diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index 1ee77fec39..cf96c9732c 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -112,13 +112,12 @@ def get_metadata(br, asin, mi): def main(args=sys.argv): # Test xisbn - #print get_social_metadata('Learning Python', None, None, '8324616489') - #print + print get_social_metadata('Learning Python', None, None, '8324616489') + print # Test sophisticated comment formatting print get_social_metadata('Angels & Demons', None, None, '9781416580829') print - return # Random tests print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 29b2cc862a..56930ad2a9 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -275,7 +275,15 @@ class MobiMLizer(object): # <mbp:frame-set/> does not exist lalalala if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return + id_ = elem.get('id', None) + if id_: + # Keep anchors so people can use display:none + # to generate hidden TOCs + elem.clear() + elem.text = None + elem.set('id', id_) + else: + return tag = barename(elem.tag) istate = copy.copy(istates[-1]) istate.rendered = False @@ -406,6 +414,12 @@ class MobiMLizer(object): parent = bstate.para if bstate.inline is None else bstate.inline if parent is not None: vtag = etree.SubElement(parent, XHTML(vtag)) + # Add anchors + for child in vbstate.body: + if child is not vbstate.para: + vtag.append(child) + else: + break for child in vbstate.para: vtag.append(child) return diff --git a/src/calibre/ebooks/oeb/output.py
diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py
index b1de3b97a1..585b56c7b6 100644
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@@ -49,5 +49,3 @@ class OEBOutput(OutputFormatPlugin):
                 with open(path, 'wb') as f:
                     f.write(str(item))
                 item.unload_data_from_memory(memory=path)
-
-
diff --git a/src/calibre/ebooks/snb/snbml.py b/src/calibre/ebooks/snb/snbml.py
index d910b6751d..7c16eb5d90 100644
--- a/src/calibre/ebooks/snb/snbml.py
+++ b/src/calibre/ebooks/snb/snbml.py
@@ -101,11 +101,12 @@ class SNBMLizer(object):
             subitem = ''
         bodyTree = trees[subitem].find(".//body")
         for line in output.splitlines():
-            if not line.find(CALIBRE_SNB_PRE_TAG) == 0:
+            pos = line.find(CALIBRE_SNB_PRE_TAG)
+            if pos == -1:
                 line = line.strip(u' \t\n\r\u3000')
             else:
                 etree.SubElement(bodyTree, "text").text = \
-                    etree.CDATA(line[len(CALIBRE_SNB_PRE_TAG):])
+                    etree.CDATA(line[pos+len(CALIBRE_SNB_PRE_TAG):])
                 continue
             if len(line) != 0:
                 if line.find(CALIBRE_SNB_IMG_TAG) == 0:
diff --git a/src/calibre/gui2/actions/view.py b/src/calibre/gui2/actions/view.py
index 758aaa0e0a..5f4f7ce428 100644
--- a/src/calibre/gui2/actions/view.py
+++ b/src/calibre/gui2/actions/view.py
@@ -35,7 +35,6 @@ class ViewAction(InterfaceAction):
         self.qaction.setMenu(self.view_menu)
         ac.triggered.connect(self.view_specific_format,
                 type=Qt.QueuedConnection)
-
     def location_selected(self, loc):
         enabled = loc == 'library'
         for action in list(self.view_menu.actions())[1:]:
@@ -134,6 +133,9 @@ class ViewAction(InterfaceAction):
         rows = self.gui.current_view().selectionModel().selectedRows()
         self._view_books(rows)
 
+    def view_triggered(self, index):
+        self._view_books([index])
+
     def view_specific_book(self, index):
         self._view_books([index])
diff --git a/src/calibre/gui2/convert/gui_conversion.py b/src/calibre/gui2/convert/gui_conversion.py
index 69dabe28b8..116f09e429 100644
--- a/src/calibre/gui2/convert/gui_conversion.py
+++ b/src/calibre/gui2/convert/gui_conversion.py
@@ -28,6 +28,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options, conne
     if log is None:
         log = Log()
     from calibre.library import db
+    from calibre.utils.config import prefs
+    prefs.refresh()
     db = db()
     db.catalog_plugin_on_device_temp_mapping = dbspec
diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py
index 1c2a541116..525b10eaa1 100644
--- a/src/calibre/gui2/library/views.py
+++ b/src/calibre/gui2/library/views.py
@@ -50,6 +50,8 @@ class BooksView(QTableView): # {{{
     def __init__(self, parent, modelcls=BooksModel):
         QTableView.__init__(self, parent)
+        self.setEditTriggers(self.SelectedClicked|self.EditKeyPressed)
+
         self.drag_allowed = True
         self.setDragEnabled(True)
         self.setDragDropOverwriteMode(False)
@@ -98,6 +100,8 @@ class BooksView(QTableView): # {{{
         self._model.about_to_be_sorted.connect(self.about_to_be_sorted)
         self._model.sorting_done.connect(self.sorting_done)
 
+        self.doubleClicked.connect(parent.iactions['View'].view_triggered)
+
     # Column Header Context Menu {{{
     def column_header_context_handler(self, action=None, column=None):
         if not action or not column:
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 6437f02cb6..5492c86fa9 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -128,7 +128,7 @@ class ContentServer(object):
         if want_mobile:
             return self.mobile()
 
-        return self.browse_toplevel()
+        return self.browse_catalog()
 
     def old(self, **kwargs):
         return self.static('index.html').replace('{prefix}',
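
The snbml.py hunk above fixes marker detection: the old code only recognised the pre-formatted-text marker when it sat at position 0 of the line, so a marker preceded by other characters fell through to the plain-text branch. The new code finds the marker anywhere in the line and slices the payload after it. A small sketch of the new behaviour, with a made-up marker value standing in for calibre's CALIBRE_SNB_PRE_TAG constant:

    CALIBRE_SNB_PRE_TAG = 'snb-pre:'  # stand-in value for illustration

    def pre_payload(line):
        # New behaviour: find the marker anywhere, return the text after
        # it, or None for an ordinary line.
        pos = line.find(CALIBRE_SNB_PRE_TAG)
        if pos == -1:
            return None
        return line[pos + len(CALIBRE_SNB_PRE_TAG):]

    print(pre_payload('snb-pre:  code line'))   # '  code line' (as before)
    print(pre_payload('  snb-pre:code line'))   # 'code line' (old code missed this)
    print(pre_payload('plain prose'))           # None
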
diff --git a/src/calibre/manual/gui.rst b/src/calibre/manual/gui.rst
index 377c409bd0..4d75400b7d 100644
--- a/src/calibre/manual/gui.rst
+++ b/src/calibre/manual/gui.rst
@@ -338,6 +338,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
       * - Keyboard Shortcut
         - Action
+      * - :kbd:`F2 (Enter in OS X)`
+        - Edit the metadata of the currently selected field in the book list.
       * - :kbd:`A`
         - Add Books
       * - :kbd:`C`
diff --git a/src/templite/__init__.py b/src/templite/__init__.py
new file mode 100644
index 0000000000..3586709b7b
--- /dev/null
+++ b/src/templite/__init__.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+#
+#       Templite+
+#       A light-weight, fully functional, general purpose templating engine
+#
+#       Copyright (c) 2009 joonis new media
+#       Author: Thimo Kraemer <thimo.kraemer@joonis.de>
+#
+#       Based on Templite - Tomer Filiba
+#       http://code.activestate.com/recipes/496702/
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+#
+
+import sys, re
+
+class Templite(object):
+    auto_emit = re.compile('(^[\'\"])|(^[a-zA-Z0-9_\[\]\'\"]+$)')
+
+    def __init__(self, template, start='${', end='}$'):
+        if len(start) != 2 or len(end) != 2:
+            raise ValueError('each delimiter must be two characters long')
+        delimiter = re.compile('%s(.*?)%s' % (re.escape(start), re.escape(end)), re.DOTALL)
+        offset = 0
+        tokens = []
+        for i, part in enumerate(delimiter.split(template)):
+            part = part.replace('\\'.join(list(start)), start)
+            part = part.replace('\\'.join(list(end)), end)
+            if i % 2 == 0:
+                if not part: continue
+                part = part.replace('\\', '\\\\').replace('"', '\\"')
+                part = '\t' * offset + 'emit("""%s""")' % part
+            else:
+                part = part.rstrip()
+                if not part: continue
+                if part.lstrip().startswith(':'):
+                    if not offset:
+                        raise SyntaxError('no block statement to terminate: ${%s}$' % part)
+                    offset -= 1
+                    part = part.lstrip()[1:]
+                    if not part.endswith(':'): continue
+                elif self.auto_emit.match(part.lstrip()):
+                    part = 'emit(%s)' % part.lstrip()
+                lines = part.splitlines()
+                margin = min(len(l) - len(l.lstrip()) for l in lines if l.strip())
+                part = '\n'.join('\t' * offset + l[margin:] for l in lines)
+                if part.endswith(':'):
+                    offset += 1
+            tokens.append(part)
+        if offset:
+            raise SyntaxError('%i block statement(s) not terminated' % offset)
+        self.__code = compile('\n'.join(tokens), '<templite %r>' % template[:20], 'exec')
+
+    def render(self, __namespace=None, **kw):
+        """
+        renders the template according to the given namespace.
+        __namespace - a dictionary serving as a namespace for evaluation
+        **kw - keyword arguments which are added to the namespace
+        """
+        namespace = {}
+        if __namespace: namespace.update(__namespace)
+        if kw: namespace.update(kw)
+        namespace['emit'] = self.write
+
+        __stdout = sys.stdout
+        sys.stdout = self
+        self.__output = []
+        eval(self.__code, namespace)
+        sys.stdout = __stdout
+        return ''.join(self.__output)
+
+    def write(self, *args):
+        for a in args:
+            self.__output.append(str(a))
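
To round out the new src/templite/__init__.py module, a brief usage sketch (the template string here is invented, not one of calibre's html_export templates): a bare name inside ${ }$ delimiters is auto-emitted, a statement ending in a colon opens a block, and a ${ :for }$-style token closes it again, matching the offset bookkeeping in __init__ above.

    # Assumes src/ is on sys.path, as in calibre's tree
    from templite import Templite

    t = Templite('Hello ${name}$!'
                 '${ for i in range(count): }$\n- item ${i}$${ :for }$')
    print(t.render(name='World', count=2))
    # Hello World!
    # - item 0
    # - item 1
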