diff --git a/resources/content_server/browse/browse.html b/resources/content_server/browse/browse.html index 4acc15f3ea..ef312334d9 100644 --- a/resources/content_server/browse/browse.html +++ b/resources/content_server/browse/browse.html @@ -8,24 +8,25 @@ - - - + + + - - + + + src="{prefix}/static/jquery_ui/js/jquery-ui-1.8.5.custom.min.js"> + src="{prefix}/static/jquery.multiselect.min.js"> - + - - + + + + +
- Show first set of books Show previous set of books              Show next set of books Show last set of books + Show first set of books Show previous set of books              Show next set of books Show last set of books
@@ -38,7 +39,7 @@
- Loading... Loading… + Loading... Loading…
diff --git a/resources/content_server/mobile.css b/resources/content_server/mobile.css index 0022b2a134..28d12bb6db 100644 --- a/resources/content_server/mobile.css +++ b/resources/content_server/mobile.css @@ -1,5 +1,9 @@ /* CSS for the mobile version of the content server webpage */ +.body { + font-family: sans-serif; +} + .navigation table.buttons { width: 100%; } @@ -53,6 +57,7 @@ div.navigation { } #listing td { padding: 0.25em; + vertical-align: middle; } #listing td.thumbnail { @@ -73,6 +78,7 @@ div.navigation { overflow: hidden; text-align: center; text-decoration: none; + vertical-align: middle; } #logo { @@ -83,4 +89,17 @@ div.navigation { clear: both; } +.data-container { + display: inline-block; + vertical-align: middle; +} +.first-line { + font-size: larger; + font-weight: bold; +} + +.second-line { + margin-top: 0.75ex; + display: block; +}
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 86921886ad..0f570bab40 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -106,7 +106,8 @@ title_sort_articles=r'^(A|The|An)\s+' auto_connect_to_folder = '' -# Specify renaming rules for sony collections. Collections on Sonys are named +# Specify renaming rules for sony collections. This tweak is only applicable if +# metadata management is set to automatic. Collections on Sonys are named # depending upon whether the field is standard or custom. A collection derived # from a standard field is named for the value in that field. For example, if # the standard 'series' column contains the name 'Darkover', then the series @@ -137,6 +138,24 @@ auto_connect_to_folder = '' sony_collection_renaming_rules={} +# Specify how sony collections are sorted. This tweak is only applicable if +# metadata management is set to automatic. You can indicate which metadata is to +# be used to sort on a collection-by-collection basis. The format of the tweak +# is a list of metadata fields from which collections are made, followed by the +# name of the metadata field containing the sort value. +# Example: The following indicates that collections built from pubdate and tags +# are to be sorted by the value in the custom column '#mydate', that collections +# built from 'series' are to be sorted by 'series_index', and that all other +# collections are to be sorted by title. If a collection metadata field is not +# named, then if it is a series-based collection it is sorted by series order, +# otherwise it is sorted by title order. +# [(['pubdate', 'tags'],'#mydate'), (['series'],'series_index'), (['*'], 'title')] +# Note that the bracketing and parentheses are required. The syntax is +# [ ( [list of fields], sort field ) , ( [ list of fields ] , sort field ) ] +# Default: empty (no rules), so collections are sorted by the scheme described +# above: series-based collections by series order, all others by title order. +sony_collection_sorting_rules = [] + + # Create search terms to apply a query across several built-in search terms. # Syntax: {'new term':['existing term 1', 'term 2', ...], 'new':['old'...] ...} # Example: create the term 'myseries' that when used as myseries:foo would @@ -184,3 +203,11 @@ content_server_wont_display = [''] # level sorts, and if you are seeing a slowdown, reduce the value of this tweak. maximum_resort_levels = 5 +# Absolute path to a TTF font file to use as the font for the title and author +# when generating a default cover. Useful if the default font (Liberation +# Serif) does not contain glyphs for the language of the books in your library.
+generate_cover_title_font = None + +# Absolute path to a TTF font file to use as the font for the footer in the +# default cover +generate_cover_foot_font = None diff --git a/resources/images/news/theecocolapse.png b/resources/images/news/theecocolapse.png new file mode 100644 index 0000000000..1c45ec14bf Binary files /dev/null and b/resources/images/news/theecocolapse.png differ diff --git a/resources/recipes/atlantic.recipe b/resources/recipes/atlantic.recipe index a41a931e37..5ae0f7d993 100644 --- a/resources/recipes/atlantic.recipe +++ b/resources/recipes/atlantic.recipe @@ -71,7 +71,9 @@ class TheAtlantic(BasicNewsRecipe): for poem in soup.findAll('div', attrs={'class':'poem'}): title = self.tag_to_string(poem.find('h4')) desc = self.tag_to_string(poem.find(attrs={'class':'author'})) - url = 'http://www.theatlantic.com'+poem.find('a')['href'] + url = poem.find('a')['href'] + if url.startswith('/'): + url = 'http://www.theatlantic.com' + url self.log('\tFound article:', title, 'at', url) self.log('\t\t', desc) poems.append({'title':title, 'url':url, 'description':desc, @@ -83,7 +85,9 @@ class TheAtlantic(BasicNewsRecipe): if div is not None: self.log('Found section: Advice') title = self.tag_to_string(div.find('h4')) - url = 'http://www.theatlantic.com'+div.find('a')['href'] + url = div.find('a')['href'] + if url.startswith('/'): + url = 'http://www.theatlantic.com' + url desc = self.tag_to_string(div.find('p')) self.log('\tFound article:', title, 'at', url) self.log('\t\t', desc) diff --git a/resources/recipes/cacm.recipe b/resources/recipes/cacm.recipe index 1618bae742..e4af9d2024 100644 --- a/resources/recipes/cacm.recipe +++ b/resources/recipes/cacm.recipe @@ -1,37 +1,37 @@ -import datetime -from calibre.web.feeds.news import BasicNewsRecipe - -class AdvancedUserRecipe1286242553(BasicNewsRecipe): - title = u'CACM' - oldest_article = 7 - max_articles_per_feed = 100 - needs_subscription = True - feeds = [(u'CACM', u'http://cacm.acm.org/magazine.rss')] - language = 'en' - __author__ = 'jonmisurda' - no_stylesheets = True - remove_tags = [ - dict(name='div', attrs={'class':['FeatureBox', 'ArticleComments', 'SideColumn', \ - 'LeftColumn', 'RightColumn', 'SiteSearch', 'MainNavBar','more', 'SubMenu', 'inner']}) - ] - cover_url_pattern = 'http://cacm.acm.org/magazines/%d/%d' - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('https://cacm.acm.org/login') - br.select_form(nr=1) - br['current_member[user]'] = self.username - br['current_member[passwd]'] = self.password - br.submit() - return br - - def get_cover_url(self): - now = datetime.datetime.now() - - cover_url = None - soup = self.index_to_soup(self.cover_url_pattern % (now.year, now.month)) - cover_item = soup.find('img',attrs={'alt':'magazine cover image'}) - if cover_item: - cover_url = cover_item['src'] - return cover_url +import datetime +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1286242553(BasicNewsRecipe): + title = u'CACM' + oldest_article = 7 + max_articles_per_feed = 100 + needs_subscription = True + feeds = [(u'CACM', u'http://cacm.acm.org/magazine.rss')] + language = 'en' + __author__ = 'jonmisurda' + no_stylesheets = True + remove_tags = [ + dict(name='div', attrs={'class':['FeatureBox', 'ArticleComments', 'SideColumn', \ + 'LeftColumn', 'RightColumn', 'SiteSearch', 'MainNavBar','more', 'SubMenu', 'inner']}) + ] + cover_url_pattern = 'http://cacm.acm.org/magazines/%d/%d' + + def get_browser(self): + 
br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('https://cacm.acm.org/login') + br.select_form(nr=1) + br['current_member[user]'] = self.username + br['current_member[passwd]'] = self.password + br.submit() + return br + + def get_cover_url(self): + now = datetime.datetime.now() + + cover_url = None + soup = self.index_to_soup(self.cover_url_pattern % (now.year, now.month)) + cover_item = soup.find('img',attrs={'alt':'magazine cover image'}) + if cover_item: + cover_url = cover_item['src'] + return cover_url diff --git a/resources/recipes/cubadebate.recipe b/resources/recipes/cubadebate.recipe index 88d06d412d..f8887b2672 100644 --- a/resources/recipes/cubadebate.recipe +++ b/resources/recipes/cubadebate.recipe @@ -1,9 +1,7 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' -newyorker.com +cubadebate.cu ''' from calibre.web.feeds.news import BasicNewsRecipe @@ -13,32 +11,44 @@ class CubaDebate(BasicNewsRecipe): __author__ = 'Darko Miletic' description = 'Contra el Terorismo Mediatico' oldest_article = 15 - language = 'es' - + language = 'es' max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False publisher = 'Cubadebate' category = 'news, politics, Cuba' encoding = 'utf-8' - extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} ' + masthead_url = 'http://www.cubadebate.cu/wp-content/themes/cubadebate/images/logo.gif' + publication_type = 'newsportal' + extra_css = """ + #BlogTitle{font-size: xx-large; font-weight: bold} + body{font-family: Verdana, Arial, Tahoma, sans-serif} + """ conversion_options = { 'comments' : description ,'tags' : category - ,'language' : 'es' + ,'language' : language ,'publisher' : publisher - ,'pretty_print': True } keep_only_tags = [dict(name='div', attrs={'id':'Outline'})] remove_tags_after = dict(name='div',attrs={'id':'BlogContent'}) - remove_tags = [dict(name='link')] + remove_tags = [ + dict(name=['link','base','embed','object','meta','iframe']) + ,dict(attrs={'id':'addthis_container'}) + ] feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')] - + remove_attributes=['width','height','lang'] + def print_version(self, url): return url + 'print/' def preprocess_html(self, soup): - return self.adeify_images(soup) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/resources/recipes/el_pais.recipe b/resources/recipes/el_pais.recipe index 1e2164b2af..2e358060b8 100644 --- a/resources/recipes/el_pais.recipe +++ b/resources/recipes/el_pais.recipe @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __author__ = 'Jordi Balcells, based on an earlier version by Lorenzo Vigentini & Kovid Goyal' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -description = 'Main daily newspaper from Spain - v1.03 (03, September 2010)' +description = 'Main daily newspaper from Spain - v1.04 (19, October 2010)' __docformat__ = 'restructuredtext en' ''' @@ -32,19 +32,16 @@ class ElPais(BasicNewsRecipe): remove_javascript = True no_stylesheets = True - keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia','cabecera_noticia_reportaje','cabecera_noticia_opinion','contenido_noticia','caja_despiece','presentacion']})] - - extra_css = ''' - p{style:normal size:12 serif} + keep_only_tags = [ dict(name='div', attrs={'class':['cabecera_noticia_reportaje 
estirar','cabecera_noticia_opinion estirar','cabecera_noticia estirar','contenido_noticia','caja_despiece']})] - ''' + extra_css = ' p{text-align: justify; font-size: 100%} body{ text-align: left; font-family: serif; font-size: 100% } h1{ font-family: sans-serif; font-size:200%; font-weight: bolder; text-align: justify; } h2{ font-family: sans-serif; font-size:150%; font-weight: 500; text-align: justify } h3{ font-family: sans-serif; font-size:125%; font-weight: 500; text-align: justify } img{margin-bottom: 0.4em} ' remove_tags = [ dict(name='div', attrs={'class':['zona_superior','pie_enlaces_inferiores','contorno_f','ampliar']}), - dict(name='div', attrs={'class':['limpiar','mod_apoyo','borde_sup','votos','info_complementa','info_relacionada','buscador_m','nav_ant_sig']}), + dict(name='div', attrs={'class':['limpiar','mod_apoyo','borde_sup','votos estirar','info_complementa','info_relacionada','buscador_m','nav_ant_sig']}), dict(name='div', attrs={'id':['suscribirse suscrito','google_noticia','utilidades','coment','foros_not','pie','lomas','calendar']}), dict(name='p', attrs={'class':'nav_meses'}), - dict(attrs={'class':['enlaces_m','miniaturas_m']}) + dict(attrs={'class':['enlaces_m','miniaturas_m','nav_miniaturas_m']}) ] feeds = [ diff --git a/resources/recipes/foxnews.recipe b/resources/recipes/foxnews.recipe index e7e76390b5..916bd28ad2 100644 --- a/resources/recipes/foxnews.recipe +++ b/resources/recipes/foxnews.recipe @@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic ' foxnews.com ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class FoxNews(BasicNewsRecipe): @@ -21,11 +20,10 @@ class FoxNews(BasicNewsRecipe): language = 'en' publication_type = 'newsportal' remove_empty_feeds = True - extra_css = ' body{font-family: Arial,sans-serif } img{margin-bottom: 0.4em} .caption{font-size: x-small} ' - - preprocess_regexps = [ - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '') - ] + extra_css = """ + body{font-family: Arial,sans-serif } + .caption{font-size: x-small} + """ conversion_options = { 'comment' : description @@ -34,27 +32,15 @@ class FoxNews(BasicNewsRecipe): , 'language' : language } - remove_attributes = ['xmlns'] - - keep_only_tags = [ - dict(name='div', attrs={'id' :['story','browse-story-content']}) - ,dict(name='div', attrs={'class':['posts articles','slideshow']}) - ,dict(name='h4' , attrs={'class':'storyDate'}) - ,dict(name='h1' , attrs={'xmlns:functx':'http://www.functx.com'}) - ,dict(name='div', attrs={'class':'authInfo'}) - ,dict(name='div', attrs={'id':'articleCont'}) - ] + remove_attributes = ['xmlns','lang'] remove_tags = [ - dict(name='div', attrs={'class':['share-links','quigo quigo2','share-text','storyControls','socShare','btm-links']}) - ,dict(name='div', attrs={'id' :['otherMedia','loomia_display','img-all-path','story-vcmId','story-url','pane-browse-story-comments','story_related']}) - ,dict(name='ul' , attrs={'class':['tools','tools alt','tools alt2','tabs']}) - ,dict(name='a' , attrs={'class':'join-discussion'}) - ,dict(name='ul' , attrs={'class':['tools','tools alt','tools alt2']}) - ,dict(name='p' , attrs={'class':'see_fullarchive'}) - ,dict(name=['object','embed','link','script']) + dict(name=['object','embed','link','script','iframe','meta','base']) + ,dict(attrs={'class':['user-control','url-description','ad-context']}) ] + remove_tags_before=dict(name='h1') + remove_tags_after =dict(attrs={'class':'url-description'}) feeds = [ (u'Latest Headlines', u'http://feeds.foxnews.com/foxnews/latest' ) @@ -67,8 +53,5 @@ class 
FoxNews(BasicNewsRecipe): ,(u'Entertainment' , u'http://feeds.foxnews.com/foxnews/entertainment' ) ] - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return self.adeify_images(soup) - + def print_version(self, url): + return url + 'print'
diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe new file mode 100644 index 0000000000..6a61405698 --- /dev/null +++ b/resources/recipes/ming_pao.recipe @@ -0,0 +1,64 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Eddie Lau' +''' +modified from Singtao Toronto calibre recipe by rty +''' + +import datetime +from calibre.web.feeds.recipes import BasicNewsRecipe + +class AdvancedUserRecipe1278063072(BasicNewsRecipe): + title = 'Ming Pao - Hong Kong' + oldest_article = 1 + max_articles_per_feed = 100 + __author__ = 'Eddie Lau' + description = 'Hong Kong Chinese Newspaper' + publisher = 'news.mingpao.com' + category = 'Chinese, News, Hong Kong' + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + language = 'zh' + encoding = 'Big5-HKSCS' + recursions = 0 + conversion_options = {'linearize_tables':True} + masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' + + keep_only_tags = [dict(name='h1'), + dict(attrs={'id':['newscontent01','newscontent02']})] + + def get_fetchdate(self): + dt_utc = datetime.datetime.utcnow() + # convert UTC to local hk time + dt_local = dt_utc - datetime.timedelta(-8.0/24) + return dt_local.strftime("%Y%m%d") + + def parse_index(self): + feeds = [] + dateStr = self.get_fetchdate() + for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + return feeds + + def parse_section(self, url): + dateStr = self.get_fetchdate() + soup = self.index_to_soup(url) + divs = soup.findAll(attrs={'class': ['bullet']}) + current_articles = [] + for i in divs: + a = i.find('a', href = True) + title = self.tag_to_string(a) + url = a.get('href', False) + url = 'http://news.mingpao.com/' + dateStr + '/' + url + current_articles.append({'title': title, 'url': url, 'description':''}) + return current_articles + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(width=True): + del item['width'] + return soup +
diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe index 2e864565ff..02bbbe4d42 100644 --- a/resources/recipes/new_scientist.recipe +++ b/resources/recipes/new_scientist.recipe @@ -8,11 +8,11 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class NewScientist(BasicNewsRecipe): - title = 'New Scientist - Online News' + title = 'New Scientist - 
Online News w. subscription' __author__ = 'Darko Miletic' description = 'Science news and science articles from New Scientist.' language = 'en' - publisher = 'New Scientist' + publisher = 'Reed Business Information Ltd.' category = 'science news, science articles, science jobs, drugs, cancer, depression, computer software' oldest_article = 7 max_articles_per_feed = 100 @@ -21,7 +21,12 @@ class NewScientist(BasicNewsRecipe): cover_url = 'http://www.newscientist.com/currentcover.jpg' masthead_url = 'http://www.newscientist.com/img/misc/ns_logo.jpg' encoding = 'utf-8' - extra_css = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} ' + needs_subscription = 'optional' + extra_css = """ + body{font-family: Arial,sans-serif} + img{margin-bottom: 0.8em} + .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em} + """ conversion_options = { 'comment' : description @@ -33,15 +38,27 @@ class NewScientist(BasicNewsRecipe): keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})] + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('http://www.newscientist.com/') + if self.username is not None and self.password is not None: + br.open('https://www.newscientist.com/user/login?redirectURL=') + br.select_form(nr=2) + br['loginId' ] = self.username + br['password'] = self.password + br.submit() + return br + remove_tags = [ dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]}) ,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools','comments','blgsocial','sharebtns']}) ,dict(name='p' , attrs={'class':['marker','infotext' ]}) ,dict(name='meta' , attrs={'name' :'description' }) - ,dict(name='a' , attrs={'rel' :'tag' }) + ,dict(name='a' , attrs={'rel' :'tag' }) + ,dict(name=['link','base','meta','iframe','object','embed']) ] remove_tags_after = dict(attrs={'class':['nbpcopy','comments']}) - remove_attributes = ['height','width'] + remove_attributes = ['height','width','lang'] feeds = [ (u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' ) @@ -62,6 +79,8 @@ class NewScientist(BasicNewsRecipe): return url + '?full=true&print=true' def preprocess_html(self, soup): + for item in soup.findAll(['quote','quotetext']): + item.name='p' for tg in soup.findAll('a'): if tg.string == 'Home': tg.parent.extract() diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 1814132667..5452ae1c6e 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -4,149 +4,79 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com -V5 - One picture per article, moved to top: -Headline -Image -Byline -Story ''' -import re, string, time +import string, re, time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +def decode(self, src): + enc = 'utf-8' + if 'iso-8859-1' in src: + enc = 'cp1252' + return src.decode(enc, 'ignore') class NYTimes(BasicNewsRecipe): - title = 'The New York Times' - __author__ = 'GRiker' + title = u'New York Times' + __author__ = 'Kovid Goyal/Nick Redding' language = 'en' - requires_version = (0, 7, 5) + requires_version = (0, 6, 36) description = 'Daily news from the New York Times (subscription version)' - allSectionKeywords = ['The Front Page', 
'International','National','Obituaries','Editorials', - 'New York','Business Day','Science Times','Sports','Dining','Arts', - 'Home','Styles','Sunday Business','Week In Review','Travel','Magazine', - 'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion", - "T Women's Fashion"] - - # List of sections to exclude - # To add a section, copy the section name from the allSectionKeywords list above - # For example, to exclude 'Dining' and 'Weddings': - #excludeSectionKeywords = ['Dining','Weddings'] - excludeSectionKeywords = [] - - # List of sections to include (test and debug only) - # By default, any sections in today's paper that are not listed in excludeSectionKeywords - # are downloaded. fetch_only specifies that only certain sections are to be downloaded. - # This should only be used for testing and debugging. - # For example, to download only 'The Front Page' section: - # fetch_only = set(['The Front Page']) - fetch_only = set([]) - if fetch_only: - excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only) - - # one_picture_per_article specifies that calibre should only use the first image - # from an article (if one exists). If one_picture_per_article = True, the image - # will be moved to a location between the headline and the byline. - # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = True - - timefmt = '' + timefmt = ' [%b %d]' needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':[ - 'articleFooter', - 'articleTools', - 'columnGroup doubleRule', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'icon enlargeThis', - 'leftNavTabs', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'subNavigation clearfix', - 'subNavigation tabContent active', - 'subNavigation tabContent active clearfix', - ]}), - dict(id=[ - 'adxLeaderboard', - 'archive', - 'articleExtras', - 'articleInline', - 'blog_sidebar', - 'businessSearchBar', - 'cCol', - 'entertainmentSearchBar', - 'footer', - 'header', - 'header_search', - 'login', - 'masthead', - 'masthead-nav', - 'memberTools', - 'navigation', - 'portfolioInline', - 'relatedArticles', - 'respond', - 'side_search', - 'side_index', - 'side_tool', - 'toolsRight', - ]), - dict(name=['script', 'noscript', 'style'])] - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - cover_margins = (18,18,'grey99') + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink', + 'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta', + 'icon enlargeThis','columnGroup last','relatedSearchesModule']}), + dict({'class':re.compile('^subNavigation')}), + dict({'class':re.compile('^leaderboard')}), + dict({'class':re.compile('^module')}), + dict({'class':'metaFootnote'}), + dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead', + 'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline', + 'side_tool', 'side_index','header','readerReviewsCount','readerReviews', + 'relatedArticles', 'relatedTopics', 'adxSponLink']), + dict(name=['script', 'noscript', 'style','form','hr'])] + encoding = decode no_stylesheets = True - extra_css = '.headline 
{text-align: left;}\n \ - .byline {font-family: monospace; \ - text-align: left; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .dateline {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .timestamp {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .source {text-align: left;}\n \ - .image {text-align: center;}\n \ - .credit {text-align: right; \ - font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .articleBody {text-align: left;}\n \ - .authorId {text-align: left; \ - font-style: italic;}\n ' + extra_css = ''' + .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; } + .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { font-size: small; } + .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + a:link {text-decoration: none; }''' def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - try: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - raw = br.submit().read() - if 'Sorry, we could not find the combination you entered. Please try again.' in raw: - raise Exception('Your username and password are incorrect') - #open('/t/log.html', 'wb').write(raw) - except: - self.log("\nFailed to login") - + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Sorry, we could not find the combination you entered. Please try again.' in raw: + raise Exception('Your username and password are incorrect') + #open('/t/log.html', 'wb').write(raw) return br + def get_masthead_url(self): + masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + #masthead = 'http://members.cox.net/nickredding/nytlogo.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nMasthead unavailable") + masthead = None + return masthead + + def get_cover_url(self): cover = None st = time.localtime() @@ -162,316 +92,101 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def get_masthead_title(self): - return self.title - - def dump_ans(self, ans): - total_article_count = 0 - for section in ans : - if self.verbose: - self.log("section %s: %d articles" % (section[0], len(section[1])) ) - for article in section[1]: - total_article_count += 1 - if self.verbose: - self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'), - article['url'].encode('mac-roman','replace'))) - self.log( "Queued %d articles" % total_article_count ) - - def dump_hex(self, src, length=16): - ''' Diagnostic ''' - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' 
for x in range(256)]) - N=0; result='' - while src: - s,src = src[:length],src[length:] - hexa = ' '.join(["%02X"%ord(x) for x in s]) - s = s.translate(FILTER) - result += "%04X %-*s %s\n" % (N, length*3, hexa, s) - N+=length - print result - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - - return fixed - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&amp;' with '&' - massaged = re.sub("&amp;","&", massaged) - return self.fixChars(massaged) - else: - return description + def short_title(self): + return 'New York Times' def parse_index(self): + self.encoding = 'cp1252' soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') + self.encoding = decode def feed_title(div): - return ''.join(div.findAll(text=True, recursive=False)).strip() + return ''.join(div.findAll(text=True, recursive=True)).strip() articles = {} key = None ans = [] - # Find each instance of class="section-headline", class="story", class="story headline" - for div in soup.findAll(True, - attrs={'class':['section-headline', 'story', 'story headline']}): + url_list = [] - if div['class'] == 'section-headline': - key = string.capwords(feed_title(div)) - if self.excludeSectionKeywords: - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key): - self.log("Skipping section %s" % key) - continue - articles[key] = [] - ans.append(key) - - elif div['class'] in ['story', 'story headline'] : - a = div.find('a', href=True) - if not a: - continue - url = re.sub(r'\?.*', '', a['href']) - url += '?pagewanted=all' - - title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip()) - - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class':'summary'}) - if summary: - description = self.massageNCXText(self.tag_to_string(summary, use_alt=False)) - - author = '' - authorAttribution = div.find(True, attrs={'class':'storyheadline-author'}) + def handle_article(div): + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + return + if not url.endswith(".html"): + return + if 'podcast' in url: + return + url += '?pagewanted=all' + if url in url_list: + return + url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + #self.log("Title: %s" % title) + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) - # Kill commas - Kindle switches to '&amp;' 
- author = re.sub(',','',author) + feed = key if key is not None else 'Uncategorized' + if not articles.has_key(feed): + articles[feed] = [] + articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) - feed = key if key is not None else 'Uncategorized' - if not articles.has_key(feed): - articles[feed] = [] - if not 'podcasts' in url: - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) - ans = self.sort_index_by(ans, {'The Front Page':-1, - 'Dining In, Dining Out':1, - 'Obituaries':2}) + + + # Find each instance of class="section-headline", class="story", class="story headline" + for div in soup.findAll(True, + attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + + if div['class'] in ['section-headline','sectionHeader']: + key = string.capwords(feed_title(div)) + articles[key] = [] + ans.append(key) + #self.log('Section: %s' % key) + + elif div['class'] in ['story', 'story headline'] : + handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + handle_article(lidiv) + +# ans = self.sort_index_by(ans, {'The Front Page':-1, +# 'Dining In, Dining Out':1, +# 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - self.dump_ans(ans) + return ans - def skip_ad_pages(self, soup): - # Skip ad pages served before actual article - skip_tag = soup.find(True, {'name':'skip'}) - if skip_tag is not None: - self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) - url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) - url += '?pagewanted=all' - self.log.warn("Skipping ad to article at '%s'" % url) - return self.index_to_soup(url, raw=True) - def preprocess_html(self, soup): - return self.strip_anchors(soup) + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: + tagline = self.tag_to_string(kicker_tag) + #self.log("FOUND KICKER %s" % tagline) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + #self.log("Searching for photo") + if img_div: + img_div.extract() + #self.log("Photo deleted") + refresh = soup.find('meta', {'http-equiv':'refresh'}) + if refresh is None: + return soup + content = refresh.get('content').partition('=')[2] + raw = self.browser.open_novisit('http://www.nytimes.com'+content).read() + return BeautifulSoup(raw.decode('cp1252', 'replace')) - def postprocess_html(self,soup, True): - print "\npostprocess_html()\n" - - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg after headline - cgFirst = soup.find(True, {'class':'columnGroup first'}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - 
cgFirst.insert(insertLoc,firstImg) - else: - self.log(">>> No class:'columnGroup first' found <<<") - # Change class="kicker" to <h3> - kicker = soup.find(True, {'class':'kicker'}) - if kicker and kicker.contents and kicker.contents[0]: - h3Tag = Tag(soup, "h3") - h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, - use_alt=False))) - kicker.replaceWith(h3Tag) - - # Change captions to italic -1 - for caption in soup.findAll(True, {'class':'caption'}) : - if caption and caption.contents[0]: - emTag = Tag(soup, "em") - c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - emTag.insert(0, c) - #hrTag = Tag(soup, 'hr') - #hrTag['class'] = 'caption_divider' - hrTag = Tag(soup, 'div') - hrTag['class'] = 'divider' - emTag.insert(1, hrTag) - caption.replaceWith(emTag) - - # Change <nyt_headline> to <h2> - h1 = soup.find('h1') - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - else: - # Blog entry - replace headline, remove <hr> tags - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - - # Change <h1> to <h3> - used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - - # Change <span class="bold"> to <b> - for subhead in soup.findAll(True, {'class':'bold'}) : - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - - # Synthesize a section header - dsk = soup.find('meta', attrs={'name':'dsk'}) - if dsk and dsk.has_key('content'): - hTag = Tag(soup,'h3') - hTag['class'] = 'section' - hTag.insert(0,NavigableString(dsk['content'])) - articleTag = soup.find(True, attrs={'id':'article'}) - if articleTag: - articleTag.insert(0,hTag) - - # Add class="articleBody" to <div> so we can format with CSS - divTag = soup.find('div',attrs={'id':'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - - # Add class="authorId" to <div> so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - - return soup - - def populate_article_metadata(self,article,soup,first): - ''' - Extract author and description from article, add to article metadata - ''' - def extract_author(soup): - byline = soup.find('meta',attrs={'name':['byl','CLMST']}) - if byline : - author = byline['content'] - else : - # Try for
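
A note on the sony_collection_sorting_rules tweak added in default_tweaks.py above: the patch documents the rule format but the driver code that consumes it is not part of this diff. The following is only a sketch of the documented precedence (an explicit rule, then the '*' catch-all, then series order or title), with a hypothetical helper name sort_field_for and the assumption that only 'series' counts as a series-based field:

def sort_field_for(collection_field, rules, series_fields=('series',)):
    # rules is a list of ([fields...], sort_field) pairs, as in the tweak.
    wildcard = None
    for fields, sort_field in rules:
        if collection_field in fields:
            return sort_field          # explicit rule wins
        if '*' in fields and wildcard is None:
            wildcard = sort_field      # remember the catch-all
    if wildcard is not None:
        return wildcard
    # No rule names this field: series-based collections sort by series
    # order, everything else by title, per the tweak's documentation.
    return 'series_index' if collection_field in series_fields else 'title'

rules = [(['pubdate', 'tags'], '#mydate'), (['series'], 'series_index')]
assert sort_field_for('tags', rules) == '#mydate'
assert sort_field_for('authors', rules) == 'title'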
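The atlantic.recipe hunks above normalize relative article links by prepending the site root whenever the href starts with '/'. The standard library's urljoin handles that case, and already-absolute URLs, in one call; a sketch of the equivalent, written for Python 2 to match the recipes in this patch:

from urlparse import urljoin  # urllib.parse.urljoin on Python 3

base = 'http://www.theatlantic.com'
# Relative hrefs are resolved against the base...
assert urljoin(base, '/magazine/poem') == 'http://www.theatlantic.com/magazine/poem'
# ...while absolute hrefs pass through unchanged.
assert urljoin(base, 'http://example.com/x') == 'http://example.com/x'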
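In ming_pao.recipe above, get_fetchdate() converts UTC to Hong Kong time by subtracting a negative timedelta: dt_utc - timedelta(-8.0/24) is the same as adding eight hours, since Hong Kong is UTC+8 with no daylight saving. A minimal standalone restatement of that arithmetic, standard library only:

import datetime

def hk_edition_date():
    # Equivalent to dt_utc - datetime.timedelta(-8.0/24) in the recipe.
    dt_local = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    return dt_local.strftime('%Y%m%d')

print(hk_edition_date())  # e.g. '20101020'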