From b2857225db8c34e21dac25d825770742c83c6633 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jan 2010 09:25:16 -0700 Subject: [PATCH 01/32] New recipe for The Kitsap Sun by Darko Miletic --- resources/images/news/kitsapun.png | Bin 0 -> 2356 bytes resources/recipes/kitsapun.recipe | 44 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 resources/images/news/kitsapun.png create mode 100644 resources/recipes/kitsapun.recipe diff --git a/resources/images/news/kitsapun.png b/resources/images/news/kitsapun.png new file mode 100644 index 0000000000000000000000000000000000000000..4b7b883d52e86f751173b3f135e782bf56bae334 GIT binary patch literal 2356 zcmV-43Cs40P)VyN(zoyXKKGuP@x#5(?PA39c z+;eB<%>4f|bLGiD-0fKvRmaS5ryC-MnYY(M2to{j7y~gt2muFjRTW&mgw ziXsw1L6kreGb|f;VET-Cj2$0gd~~E!+Bars@(hNCr*rhsE|L^T6vV;IFjKUyN#$gQ zC(mp>2FFJmS*aLg+Kh7m7(YIOBo6G_%)#BS@sXQ|n;+MF1Qd*@xvZ z1_w$+oFq&U$f}4qb)A_q>pYfRaSQIw{(o*@$L2o~sYqqxwW1J; z*r#!GfE2|6Be{{?P&Yy!SS@wDzh;Ucs5;plH85+G&pG!}-uwPr0NDNJi|pL`6e$Hl z43vdXq=x&kJ%9*AhZF={AS5BEAT5O&fO|(r1cXMR*NV9}iAS9|=U&3%<*RUa-dg_{ zhxTlyOpzD^WfF?i8W01t$ib{hL~gkO5(ch-f-7!{_dQ?+mT64rGtbD`Saj(s<}O%{ zo3VYvV;tVQnSmk^0+a=Wyj zK0>L^87kRlX?kfKloh|+jYyeVdVe1SI+u$Bed-{cC` z@Y>s77X(wvDQB?ws;@CHJd@F*dwBhY2MN@qB2WaUj8Ft420@VKVEua1iSmA_#ob8( z?|6?XffE?$gAQPL>X|IL_8Y`vh_R!4d427#2~v?#Bm`%mpml#kF_G4MIW_JfBtq*| z6(iMJHQ>vj?wBcPM%+8~lc%4<`#yXZu^0ki-?r66pP)z)+=w0;PKyu`+!b-~5Sxl> z+V&Fxw~B22kQgC|6KEQZAP__d(vrTNu^b|nL*4!Si*Ld_QP;Vt@s_)3mb)@8rVXgu z6T3CQ?pI*MCW5(OF1Q$mVeVMVnm$@Z1j`fM_2A@LELm|k*%G~8P1dF&R<%-RLsd~T z+*;nZSZI0&>kKrr1Hsz`H^EIh&;!7zMe`)Y+*j zC1=GHOcis*%+PjP+f|u6>KXB{Be*urgysw?t%Wtx_H#`~B#a%|&6elxMe7rum^^bn z%RYJ!>WN;jV!W4`sFbR1`U5jMPuOWwv!kk8f?7aLFcia3b<`bco6Zmd5(G74?8sg= zKl5`ePjt_w&RW1FH+-M$1$C{R5w$uww+j0qQ0;JlJDUB~;9?y`oEmq}e(~0i+v^nQ-`6qseNr|epMl_hnxk=xB*Y~rbyKixHUBD~cYRCao zBS?eNAb}7fAw)z1?hRH_3QQc`!}_Ow(qN+9Gi}a7F1Yem>Rh8{^eUrQE4{W=Q}a_W z;Ot|sxnNQ7T(+*x$cgqw*jR6BQ-N70R&VdMvc3XBoJe4 z(`Q6g5ibb>jv{K5QP}t93*f@#AG@o=(!!-H5eaO5_5o6n7_R~YMQAd7()(lZhr87L z%ImOkJM2AdjMp_;VXT+Pc2WpLh)ww-TW-8k6gW;X5ZSx^&un<+SKa;?0*jWdWcdx> zBAa8L=~WdIb^UOzx$nlW|D`Jrsj6%CydBteFf%w9Da(?#-yUOSn%a5btc4eI_*l)5 z4Kh?l3ah~*B93LH2#Hk$#wKmALjNcUUjzE$5{v;X9FKOv4; z#!MJJ)}yM9bM}HoEI5DhDc=Pn#bs=LV=FVK3G=2W=1$9$1Bs&Th=&Si*Vd=O;i^yG z)4lhei*MK@B9Cl(_MwMw{n2yIX+yqx%Qc=cGz;2OR(|{@KKuDEGdw)`l#OF!qpV%C zn)NTQW9M62IBRyvnR82K4#PmK2r*(=(b_q8=}NBt^!K&^PXLb>MX`yOdF`SxYw`FTKK_r z*MH#``}jZOKU7_5ka2wYAOHXWC3HntbYx+4WjbSWWnpw>05UK!FfA}KEipJ$FgZFh zGdeUgEigAaFffzMOmYAK03~!qSaf7zbY(hiZ)9m^c>ppnF)%GKF)cATR53C-G%z|a aHZ3qWIxsLzr}_o}0000 Date: Mon, 18 Jan 2010 09:33:46 -0700 Subject: [PATCH 02/32] Improved recipe for FTD --- resources/recipes/ftd.recipe | 31 ++++-- resources/recipes/ledevoir.recipe | 158 +++++++++++++++--------------- 2 files changed, 100 insertions(+), 89 deletions(-) diff --git a/resources/recipes/ftd.recipe b/resources/recipes/ftd.recipe index db53a3ed19..d18f9bdc56 100644 --- a/resources/recipes/ftd.recipe +++ b/resources/recipes/ftd.recipe @@ -9,16 +9,16 @@ from calibre.web.feeds.news import BasicNewsRecipe class FTDe(BasicNewsRecipe): - + title = 'FTD' description = 'Financial Times Deutschland' __author__ = 'Oliver Niesner' use_embedded_content = False timefmt = ' [%d %b %Y]' - language = 'de' + language = _('German') max_articles_per_feed = 40 no_stylesheets = True - + remove_tags = [dict(id='navi_top'), dict(id='topbanner'), dict(id='seitenkopf'), @@ -28,8 +28,13 @@ class FTDe(BasicNewsRecipe): dict(id='ADS_Top'), dict(id='spinner'), dict(id='ftd-contentad'), + dict(id='ftd-promo'), dict(id='nava-50009007-1-0'), dict(id='navli-50009007-1-0'), + dict(id='Box5000534-0-0-0'), + dict(id='ExpV-1-0-0-1'), + dict(id='ExpV-1-0-0-0'), + dict(id='PollExpV-2-0-0-0'), dict(id='starRating'), dict(id='saveRating'), dict(id='yLayer'), @@ -44,14 +49,19 @@ class FTDe(BasicNewsRecipe): dict(name='ul', attrs={'class':'nav'}), dict(name='p', attrs={'class':'articleOptionHead'}), dict(name='p', attrs={'class':'articleOptionFoot'}), + dict(name='p', attrs={'class':'moreInfo'}), dict(name='div', attrs={'class':'chartBox'}), dict(name='div', attrs={'class':'ratingOpt starRatingContainer articleOptionFootFrame'}), dict(name='div', attrs={'class':'box boxArticleBasic boxComments boxTransparent'}), - dict(name='div', attrs={'class':'box boxNavTabs '}), + dict(name='div', attrs={'class':'box boxNavTabs'}), + dict(name='div', attrs={'class':'boxMMRgtLow'}), dict(name='span', attrs={'class':'vote_455857'}), dict(name='div', attrs={'class':'relatedhalb'}), dict(name='div', attrs={'class':'box boxListScrollOutline'}), + dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}), + dict(name='div', attrs={'class':'box boxTeaser'}), dict(name='div', attrs={'class':'tagCloud'}), + dict(name='div', attrs={'class':'pollView'}), dict(name='div', attrs={'class':'box boxArticleBasic boxNavTabsOutline'}), dict(name='div', attrs={'class':'ftdHpNav'}), dict(name='div', attrs={'class':'ftdHead'}), @@ -67,11 +77,12 @@ class FTDe(BasicNewsRecipe): dict(name='div', attrs={'class':'wertungoben'}), dict(name='div', attrs={'class':'artikelfuss'}), dict(name='a', attrs={'class':'rating'}), + dict(name='a', attrs={'href':'#rt'}), dict(name='div', attrs={'class':'articleOptionFootFrame'}), dict(name='div', attrs={'class':'artikelsplitfaq'})] - remove_tags_after = [dict(name='a', attrs={'class':'more'})] - - feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'), + #remove_tags_after = [dict(name='a', attrs={'class':'more'})] + + feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'), ('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'), ('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'), ('Politik', 'http://www.ftd.de/rss2/politik'), @@ -82,8 +93,8 @@ class FTDe(BasicNewsRecipe): ('Auto', 'http://www.ftd.de/rss2/auto'), ('Lifestyle', 'http://www.ftd.de/rss2/lifestyle') - ] - + ] + def print_version(self, url): - return url + '?mode=print' + return url.replace('.html', '.html?mode=print') diff --git a/resources/recipes/ledevoir.recipe b/resources/recipes/ledevoir.recipe index c9dbd8c5d7..97b33c43a7 100644 --- a/resources/recipes/ledevoir.recipe +++ b/resources/recipes/ledevoir.recipe @@ -1,79 +1,79 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__author__ = 'Lorenzo Vigentini' -__copyright__ = '2009, Lorenzo Vigentini ' -__version__ = 'v1.01' -__date__ = '14, January 2010' -__description__ = 'Canadian Paper ' - -''' -http://www.ledevoir.com/ -''' - -from calibre.web.feeds.news import BasicNewsRecipe - -class ledevoir(BasicNewsRecipe): - author = 'Lorenzo Vigentini' - description = 'Canadian Paper' - - cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif' - title = u'Le Devoir' - publisher = 'leDevoir.com' - category = 'News, finance, economy, politics' - - language = 'fr' - encoding = 'utf-8' - timefmt = '[%a, %d %b, %Y]' - - max_articles_per_feed = 50 - use_embedded_content = False - recursion = 10 - - remove_javascript = True - no_stylesheets = True - - keep_only_tags = [ - dict(name='div', attrs={'id':'article'}), - dict(name='ul', attrs={'id':'ariane'}) - ] - - remove_tags = [ - dict(name='div', attrs={'id':'dialog'}), - dict(name='div', attrs={'class':['interesse_actions','reactions']}), - dict(name='ul', attrs={'class':'mots_cles'}), - dict(name='a', attrs={'class':'haut'}), - dict(name='h5', attrs={'class':'interesse_actions'}) - ] - - feeds = [ - (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), - (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), - (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), - (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), - (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), - (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'), - (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'), - (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'), - (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), - (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'), - (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), - (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50') - ] - - extra_css = ''' - h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;} - h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;} - h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;} - h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; } - h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;} - .specs {line-height:1em;margin:1px 0;} - .specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} - .specs span.auteur a, - .specs span.auteur span {text-transform:uppercase;color:#787878;} - .specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} - ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;} - ul#ariane li {display:inline;} - ul#ariane a {color:#2E2E2E;text-decoration:underline;} - .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;} - .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;} - ''' +#!/usr/bin/env python +__license__ = 'GPL v3' +__author__ = 'Lorenzo Vigentini' +__copyright__ = '2009, Lorenzo Vigentini ' +__version__ = 'v1.01' +__date__ = '14, January 2010' +__description__ = 'Canadian Paper ' + +''' +http://www.ledevoir.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ledevoir(BasicNewsRecipe): + author = 'Lorenzo Vigentini' + description = 'Canadian Paper' + + cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif' + title = u'Le Devoir' + publisher = 'leDevoir.com' + category = 'News, finance, economy, politics' + + language = 'fr' + encoding = 'utf-8' + timefmt = '[%a, %d %b, %Y]' + + max_articles_per_feed = 50 + use_embedded_content = False + recursion = 10 + + remove_javascript = True + no_stylesheets = True + + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}), + dict(name='ul', attrs={'id':'ariane'}) + ] + + remove_tags = [ + dict(name='div', attrs={'id':'dialog'}), + dict(name='div', attrs={'class':['interesse_actions','reactions']}), + dict(name='ul', attrs={'class':'mots_cles'}), + dict(name='a', attrs={'class':'haut'}), + dict(name='h5', attrs={'class':'interesse_actions'}) + ] + + feeds = [ + (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), + (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), + (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), + (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), + (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), + (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'), + (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'), + (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'), + (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), + (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'), + (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), + (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50') + ] + + extra_css = ''' + h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;} + h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;} + h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;} + h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; } + h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;} + .specs {line-height:1em;margin:1px 0;} + .specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} + .specs span.auteur a, + .specs span.auteur span {text-transform:uppercase;color:#787878;} + .specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} + ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;} + ul#ariane li {display:inline;} + ul#ariane a {color:#2E2E2E;text-decoration:underline;} + .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;} + .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;} + ''' From 7535f5862712a54e4dd3ae54132f2a3060229306 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jan 2010 09:46:52 -0700 Subject: [PATCH 03/32] New recipe for The Yemen Times by kwetal --- resources/recipes/yementimes.recipe | 125 ++++++++++++++++++++++++++++ src/calibre/utils/localization.py | 1 + 2 files changed, 126 insertions(+) create mode 100644 resources/recipes/yementimes.recipe diff --git a/resources/recipes/yementimes.recipe b/resources/recipes/yementimes.recipe new file mode 100644 index 0000000000..426c9a748c --- /dev/null +++ b/resources/recipes/yementimes.recipe @@ -0,0 +1,125 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class YemenTimesRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en_YE' + country = 'YE' + version = 1 + + title = u'Yemen Times' + publisher = u'yementimes.com' + category = u'News, Opinion, Yemen' + description = u'Award winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + encoding = 'utf-8' + + remove_empty_feeds = True + no_stylesheets = True + remove_javascript = True + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1', + 'class': 'DMAIN2'})) + remove_attributes = ['style'] + + INDEX = 'http://www.yementimes.com/' + feeds = [] + feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT')) + feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news')) + feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News')) + feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report')) + feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health')) + feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview')) + feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion')) + feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business')) + feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed')) + feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture')) + feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View')) + feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety')) + feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education')) + + extra_css = ''' + body {font-family:verdana, arial, helvetica, geneva, sans-serif;} + div.yemen_byline {font-size: medium; font-weight: bold;} + div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;} + .yemen_caption {font-size: x-small; font-style: italic; color: #696969;} + ''' + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher, 'linearize_tables': True} + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.set_handle_gzip(True) + + return br + + def parse_index(self): + answer = [] + for feed_title, feed in self.feeds: + soup = self.index_to_soup(feed) + + newsbox = soup.find('div', 'newsbox') + main = newsbox.findNextSibling('table') + + articles = [] + for li in main.findAll('li'): + title = self.tag_to_string(li.a) + url = self.INDEX + li.a['href'] + articles.append({'title': title, 'date': None, 'url': url, 'description': '
 '}) + + answer.append((feed_title, articles)) + + return answer + + def preprocess_html(self, soup): + freshSoup = self.getFreshSoup(soup) + + headline = soup.find('div', attrs = {'id': 'DVMTIT'}) + if headline: + div = headline.findNext('div', attrs = {'id': 'DVTOP'}) + img = None + if div: + img = div.find('img') + + headline.name = 'h1' + freshSoup.body.append(headline) + if img is not None: + freshSoup.body.append(img) + + byline = soup.find('div', attrs = {'id': 'DVTIT'}) + if byline: + date_el = byline.find('span') + if date_el: + pub_date = self.tag_to_string(date_el) + date = Tag(soup, 'div', attrs = [('class', 'yemen_date')]) + date.append(pub_date) + date_el.extract() + + raw = '
'.join(['%s' % (part) for part in byline.findAll(text = True)]) + author = BeautifulSoup('') + + if date is not None: + freshSoup.body.append(date) + freshSoup.body.append(author) + + story = soup.find('div', attrs = {'id': 'DVDET'}) + if story: + for table in story.findAll('table'): + if table.find('img'): + table['class'] = 'yemen_caption' + + freshSoup.body.append(story) + + return freshSoup + + def getFreshSoup(self, oldSoup): + freshSoup = BeautifulSoup('') + if oldSoup.head.title: + freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title)) + return freshSoup diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 1ade012b1f..90f86a8368 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -104,6 +104,7 @@ _extra_lang_codes = { 'en_CY' : _('English (Cyprus)'), 'en_PK' : _('English (Pakistan)'), 'en_SG' : _('English (Singapore)'), + 'en_YE' : _('English (Yemen)'), 'de_AT' : _('German (AT)'), 'nl' : _('Dutch (NL)'), 'nl_BE' : _('Dutch (BE)'), From e71b23e5c37a9d90139416772455695859bb8404 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 18 Jan 2010 09:48:12 -0700 Subject: [PATCH 04/32] ebook-meta: Fix setting of series metadata --- src/calibre/ebooks/metadata/cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index e4ea1a3931..5de8b76c43 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -128,6 +128,10 @@ def do_set_metadata(opts, mi, stream, stream_type): mi.title_sort = title_sort(opts.title) if getattr(opts, 'tags', None) is not None: mi.tags = [t.strip() for t in opts.tags.split(',')] + if getattr(opts, 'series', None) is not None: + mi.series = opts.series.strip() + if getattr(opts, 'series_index', None) is not None: + mi.series_index = float(opts.series_index.strip()) if getattr(opts, 'cover', None) is not None: ext = os.path.splitext(opts.cover)[1].replace('.', '').upper() From e8d1e03f737ccbb765276c608f474fe0670a9ea6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 08:31:41 -0700 Subject: [PATCH 05/32] Fix #4607 (Updated recipe for The Amercian Spectator) --- resources/recipes/amspec.recipe | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/resources/recipes/amspec.recipe b/resources/recipes/amspec.recipe index 62bec5ae18..e5a76a4f86 100644 --- a/resources/recipes/amspec.recipe +++ b/resources/recipes/amspec.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' spectator.org ''' @@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheAmericanSpectator(BasicNewsRecipe): title = 'The American Spectator' __author__ = 'Darko Miletic' - language = 'en' - description = 'News from USA' + category = 'news, politics, USA, world' + publisher = 'The American Spectator' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + language = 'en' INDEX = 'http://spectator.org' - html2lrf_options = [ - '--comment' , description - , '--category' , 'news, politics, USA' - , '--publisher' , title - ] + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } keep_only_tags = [ dict(name='div', attrs={'class':'post inner'}) @@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe): remove_tags = [ dict(name='object') - ,dict(name='div', attrs={'class':'col3' }) - ,dict(name='div', attrs={'class':'post-options' }) - ,dict(name='p' , attrs={'class':'letter-editor'}) - ,dict(name='div', attrs={'class':'social' }) + ,dict(name='div', attrs={'class':['col3','post-options','social']}) + ,dict(name='p' , attrs={'class':['letter-editor','meta']}) ] - feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')] + feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')] def get_cover_url(self): cover_url = None @@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe): def print_version(self, url): return url + '/print' + + def get_article_url(self, article): + return article.get('guid', None) + From 0e0863103a12bed55e75bc48ae30f51ea80bbc48 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 08:43:52 -0700 Subject: [PATCH 06/32] ... --- src/calibre/ebooks/pdf/reflow.py | 13 ++++++++++--- src/calibre/gui2/ui.py | 2 +- src/calibre/library/database2.py | 9 +++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 1b2149cf3a..f4bdb9c7ac 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -20,6 +20,10 @@ class Font(object): class Column(object): + # A column contains an element is the element bulges out to + # the left or the right by at most HFUZZ*col width. + HFUZZ = 0.2 + def __init__(self): self.left = self.right = self.top = self.bottom = 0 self.width = self.height = 0 @@ -41,6 +45,10 @@ class Column(object): for x in self.elements: yield x + def contains(self, elem): + return elem.left > self.left - self.HFUZZ*self.width and \ + elem.right < self.right + self.HFUZZ*self.width + class Element(object): def __eq__(self, other): @@ -238,11 +246,10 @@ class Page(object): return columns def find_elements_in_row_of(self, x): - interval = Interval(x.top - self.YFUZZ * self.average_text_height, + interval = Interval(x.top, x.top + self.YFUZZ*(1+self.average_text_height)) h_interval = Interval(x.left, x.right) - m = max(0, x.idx-15) - for y in self.elements[m:x.idx+15]: + for y in self.elements[x.idx:x.idx+15]: if y is not x: y_interval = Interval(y.top, y.bottom) x_interval = Interval(y.left, y.right) diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 6cbae7f7b0..98b416eaa3 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -1361,7 +1361,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): def generate_catalog(self): rows = self.library_view.selectionModel().selectedRows() - if not rows: + if not rows or len(rows) < 3: rows = xrange(self.library_view.model().rowCount(QModelIndex())) ids = map(self.library_view.model().id, rows) dbspec = None diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 84638410c7..db75516292 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase): for i in iter(self): yield i[x] - def get_data_as_dict(self, prefix=None, authors_as_string=False): + def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None): ''' Return all metadata stored in the database as a dict. Includes paths to the cover and each format. :param prefix: The prefix for all paths. By default, the prefix is the absolute path to the library folder. + :param ids: Set of ids to return the data for. If None return data for + all entries in database. ''' if prefix is None: prefix = self.library_path @@ -1649,12 +1651,15 @@ class LibraryDatabase2(LibraryDatabase): 'isbn', 'uuid', 'pubdate']) data = [] for record in self.data: + db_id = record[FIELD_MAP['id']] + if ids is not None and db_id not in ids: + continue if record is None: continue x = {} for field in FIELDS: x[field] = record[FIELD_MAP[field]] data.append(x) - x['id'] = record[FIELD_MAP['id']] + x['id'] = db_id x['formats'] = [] if not x['authors']: x['authors'] = _('Unknown') From c290fc198c013a90dffc4a643e4dedfe53192c16 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 08:51:11 -0700 Subject: [PATCH 07/32] ... --- src/calibre/library/database2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index db75516292..7b0f7a083e 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1651,10 +1651,10 @@ class LibraryDatabase2(LibraryDatabase): 'isbn', 'uuid', 'pubdate']) data = [] for record in self.data: + if record is None: continue db_id = record[FIELD_MAP['id']] if ids is not None and db_id not in ids: continue - if record is None: continue x = {} for field in FIELDS: x[field] = record[FIELD_MAP[field]] From 4332e1a6412c9896b7e0130a823321013a8b8c59 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 09:31:20 -0700 Subject: [PATCH 08/32] Faster recipe for The BBC by Darko Miletic --- resources/recipes/bbc_fast.recipe | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 resources/recipes/bbc_fast.recipe diff --git a/resources/recipes/bbc_fast.recipe b/resources/recipes/bbc_fast.recipe new file mode 100644 index 0000000000..12ae9ce1eb --- /dev/null +++ b/resources/recipes/bbc_fast.recipe @@ -0,0 +1,60 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +news.bbc.co.uk +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + +class BBC(BasicNewsRecipe): + title = 'BBC News (fast)' + __author__ = 'Darko Miletic' + description = 'News from UK. A much faster version that does not download pictures' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'utf8' + publisher = 'BBC' + category = 'news, UK, world' + language = 'en' + extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } ' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + remove_tags_before = dict(name='div',attrs={'class':'headline'}) + remove_tags_after = dict(name='div', attrs={'class':'footer'}) + remove_tags = [ + dict(name=['object','link','script','iframe']) + ,dict(name='div', attrs={'class':'footer'}) + ] + + feeds = [ + ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'), + ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'), + ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'), + ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'), + ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'), + ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'), + ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'), + ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'), + ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'), + ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'), + ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'), + ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'), + ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'), + ] + + def print_version(self, url): + emp,sep,rstrip = url.partition('http://') + return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip + + def get_article_url(self, article): + return article.get('guid', None) + From f226bdfe9daa3f64c9d0f4c750face8fa7ef9549 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 12:42:44 -0700 Subject: [PATCH 09/32] New recipe for The Reader's Digest by BrianG --- resources/recipes/readers_digest.recipe | 188 ++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 resources/recipes/readers_digest.recipe diff --git a/resources/recipes/readers_digest.recipe b/resources/recipes/readers_digest.recipe new file mode 100644 index 0000000000..3689ca4c53 --- /dev/null +++ b/resources/recipes/readers_digest.recipe @@ -0,0 +1,188 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +''' +''' +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.web.feeds import Feed + + +class ReadersDigest(BasicNewsRecipe): + + title = 'Readers Digest' + __author__ = 'BrianG' + language = 'en' + description = 'Readers Digest Feeds' + no_stylesheets = True + use_embedded_content = False + oldest_article = 60 + max_articles_per_feed = 200 + + language = 'en' + remove_javascript = True + + extra_css = ''' h1 {font-family:georgia,serif;color:#000000;} + .mainHd{font-family:georgia,serif;color:#000000;} + h2 {font-family:Arial,Sans-serif;} + .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; } + .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;} + .byline{font-family:Arial,Sans-serif; font-size:x-small ;} + .photoBkt{ font-size:x-small ;} + .vertPhoto{font-size:x-small ;} + .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .artTxt{font-family:georgia,serif;} + .caption{font-family:georgia,serif; font-size:x-small;color:#333333;} + .credit{font-family:georgia,serif; font-size:x-small;color:#999999;} + a:link{color:#CC0000;} + .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;} + ''' + + + remove_tags = [ + dict(name='h4', attrs={'class':'close'}), + dict(name='div', attrs={'class':'fromLine'}), + dict(name='img', attrs={'class':'colorTag'}), + dict(name='div', attrs={'id':'sponsorArticleHeader'}), + dict(name='div', attrs={'class':'horizontalAd'}), + dict(name='div', attrs={'id':'imageCounterLeft'}), + dict(name='div', attrs={'id':'commentsPrint'}) + ] + + + feeds = [ + ('New in RD', 'http://feeds.rd.com/ReadersDigest'), + ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'), + ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'), + ('Blogs','http://feeds.rd.com/ReadersDigestBlogs') + ] + + cover_url = 'http://www.rd.com/images/logo-main-rd.gif' + + + +#------------------------------------------------------------------------------------------------- + + def print_version(self, url): + + # Get the identity number of the current article and append it to the root print URL + + if url.find('/article') > 0: + ident = url[url.find('/article')+8:url.find('.html?')-4] + url = 'http://www.rd.com/content/printContent.do?contentId=' + ident + + elif url.find('/post') > 0: + + # in this case, have to get the page itself to derive the Print page. + soup = self.index_to_soup(url) + newsoup = soup.find('ul',attrs={'class':'printBlock'}) + url = 'http://www.rd.com' + newsoup('a')[0]['href'] + url = url[0:url.find('&Keep')] + + return url + +#------------------------------------------------------------------------------------------------- + + def parse_index(self): + + pages = [ + ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}), + # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}), + ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'}) + + ] + + feeds = [] + + for page in pages: + section, url, divider, attrList = page + newArticles = self.page_parse(url, divider, attrList) + feeds.append((section,newArticles)) + + # after the pages of the site have been processed, parse several RSS feeds for additional sections + newfeeds = Feed() + newfeeds = self.parse_rss() + + + # The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable + # for this module (parse_index). + + for feed in newfeeds: + newArticles = [] + for article in feed.articles: + newArt = { + 'title' : article.title, + 'url' : article.url, + 'date' : article.date, + 'description' : article.text_summary + } + newArticles.append(newArt) + + + # New and Blogs should be the first two feeds. + if feed.title == 'New in RD': + feeds.insert(0,(feed.title,newArticles)) + elif feed.title == 'Blogs': + feeds.insert(1,(feed.title,newArticles)) + else: + feeds.append((feed.title,newArticles)) + + + return feeds + +#------------------------------------------------------------------------------------------------- + + def page_parse(self, mainurl, divider, attrList): + + articles = [] + mainsoup = self.index_to_soup(mainurl) + for item in mainsoup.findAll(attrs=attrList): + newArticle = { + 'title' : item('img')[0]['alt'], + 'url' : 'http://www.rd.com'+item('a')[0]['href'], + 'date' : '', + 'description' : '' + } + articles.append(newArticle) + + + + return articles + + + +#------------------------------------------------------------------------------------------------- + + def parse_rss (self): + + # Do the "official" parse_feeds first + feeds = BasicNewsRecipe.parse_feeds(self) + + + # Loop thru the articles in all feeds to find articles with "recipe" in it + recipeArticles = [] + for curfeed in feeds: + delList = [] + for a,curarticle in enumerate(curfeed.articles): + if curarticle.title.upper().find('RECIPE') >= 0: + recipeArticles.append(curarticle) + delList.append(curarticle) + if len(delList)>0: + for d in delList: + index = curfeed.articles.index(d) + curfeed.articles[index:index+1] = [] + + # If there are any recipes found, create a new Feed object and append. + if len(recipeArticles) > 0: + pfeed = Feed() + pfeed.title = 'Recipes' + pfeed.descrition = 'Recipe Feed (Virtual)' + pfeed.image_url = None + pfeed.oldest_article = 30 + pfeed.id_counter = len(recipeArticles) + # Create a new Feed, add the recipe articles, and then append + # to "official" list of feeds + pfeed.articles = recipeArticles[:] + feeds.append(pfeed) + + return feeds + From b72a0652ff26ad7770825f940d588db696076f9f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 12:57:49 -0700 Subject: [PATCH 10/32] E-book viewer: Fit images to viewer window (can be turned off via Preferences) --- resources/viewer/images.js | 22 ++++++++++++++++ src/calibre/gui2/viewer/config.ui | 13 +++++++--- src/calibre/gui2/viewer/documentview.py | 34 ++++++++++++------------- 3 files changed, 48 insertions(+), 21 deletions(-) create mode 100644 resources/viewer/images.js diff --git a/resources/viewer/images.js b/resources/viewer/images.js new file mode 100644 index 0000000000..cc6e6d47e5 --- /dev/null +++ b/resources/viewer/images.js @@ -0,0 +1,22 @@ +/* + * images management + * Copyright 2008 Kovid Goyal + * License: GNU GPL v3 + */ + +function scale_images() { + $("img:visible").each(function() { + var offset = $(this).offset(); + $(this).css("max-width", (window.innerWidth-offset.left-5)+"px"); + $(this).css("max-height", (window.innerHeight-5)+"px"); + }); +} + +function setup_image_scaling_handlers() { + scale_images(); + $(window).resize(function(){ + scale_images(); + }); +} + + diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index fe1dc85c93..d6e71c77d2 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -7,14 +7,14 @@ 0 0 479 - 574 + 606 Configure Ebook viewer - + :/images/config.svg:/images/config.svg @@ -164,7 +164,7 @@ - + Remember last used &window size @@ -218,6 +218,13 @@ + + + + &Resize images larger than the viewer window (needs restart) + + + diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index b35e28121a..790b1c4f2f 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -10,7 +10,7 @@ from base64 import b64encode from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \ QPainter, QPalette, QBrush, QFontDatabase, QDialog, \ QColor, QPoint, QImage, QRegion, QVariant, QIcon, \ - QFont, QObject, QApplication, pyqtSignature, QAction + QFont, pyqtSignature, QAction from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings from calibre.utils.config import Config, StringConfig @@ -21,7 +21,7 @@ from calibre.constants import iswindows from calibre import prints, guess_type from calibre.gui2.viewer.keys import SHORTCUTS -bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = None +bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images =None def load_builtin_fonts(): base = P('fonts/liberation/*.ttf') @@ -42,6 +42,8 @@ def config(defaults=None): help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.')) c.add_opt('max_view_width', default=6000, help=_('Maximum width of the viewer window, in pixels.')) + c.add_opt('fit_images', default=True, + help=_('Resize images larger than the viewer window to fit inside it')) c.add_opt('hyphenate', default=False, help=_('Hyphenate text')) c.add_opt('hyphenate_default_lang', default='en', help=_('Default language for hyphenation rules')) @@ -59,20 +61,6 @@ def config(defaults=None): return c -class PythonJS(QObject): - - def __init__(self, callback): - QObject.__init__(self, QApplication.instance()) - self.setObjectName("py_bridge") - self._callback = callback - - @pyqtSignature("QString") - def callback(self, msg): - print "callback called" - self._callback(msg) - - - class ConfigDialog(QDialog, Ui_Dialog): def __init__(self, shortcuts, parent=None): @@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog): self.shortcut_config = ShortcutConfig(shortcuts, parent=self) p = self.tabs.widget(1) p.layout().addWidget(self.shortcut_config) + self.opt_fit_images.setChecked(opts.fit_images) def accept(self, *args): @@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog): c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()]) c.set('user_css', unicode(self.css.toPlainText())) c.set('remember_window_size', self.opt_remember_window_size.isChecked()) + c.set('fit_images', self.opt_fit_images.isChecked()) c.set('max_view_width', int(self.max_view_width.value())) c.set('hyphenate', self.hyphenate.isChecked()) idx = self.hyphenate_default_lang.currentIndex() @@ -157,7 +147,6 @@ class Document(QWebPage): self.setObjectName("py_bridge") self.debug_javascript = False self.current_language = None - #self.js_bridge = PythonJS(self.js_callback) self.setLinkDelegationPolicy(self.DelegateAllLinks) self.scroll_marks = [] @@ -197,9 +186,14 @@ class Document(QWebPage): opts = config().parse() self.hyphenate = opts.hyphenate self.hyphenate_default_lang = opts.hyphenate_default_lang + self.do_fit_images = opts.fit_images + + def fit_images(self): + if self.do_fit_images: + self.javascript('setup_image_scaling_handlers()') def load_javascript_libraries(self): - global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator + global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images self.mainFrame().addToJavaScriptWindowObject("py_bridge", self) if jquery is None: jquery = P('content_server/jquery.js', data=True) @@ -215,6 +209,9 @@ class Document(QWebPage): if referencing is None: referencing = P('viewer/referencing.js', data=True) self.javascript(referencing) + if images is None: + images = P('viewer/images.js', data=True) + self.javascript(images) if hyphenation is None: hyphenation = P('viewer/hyphenation.js', data=True) self.javascript(hyphenation) @@ -541,6 +538,7 @@ class DocumentView(QWebView): return self.loading_url = None self.document.set_bottom_padding(0) + self.document.fit_images() self._size_hint = self.document.mainFrame().contentsSize() scrolled = False if self.to_bottom: From e936eb0c84a19765796dfa070b11878be6aaa8f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 14:02:16 -0700 Subject: [PATCH 11/32] MOBI Input: Rescale img width and height attributes that were specified in em units. Fixes #4608 (Built in viewre picture issue) --- resources/viewer/images.js | 1 + src/calibre/ebooks/mobi/reader.py | 10 ++++++++++ src/calibre/ebooks/oeb/stylizer.py | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/resources/viewer/images.js b/resources/viewer/images.js index cc6e6d47e5..ea68009254 100644 --- a/resources/viewer/images.js +++ b/resources/viewer/images.js @@ -7,6 +7,7 @@ function scale_images() { $("img:visible").each(function() { var offset = $(this).offset(); + //window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width')); $(this).css("max-width", (window.innerWidth-offset.left-5)+"px"); $(this).css("max-height", (window.innerHeight-5)+"px"); }); diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index b8557aea98..4f894ce088 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -563,6 +563,16 @@ class MobiReader(object): recindex = attrib.pop(attr, None) or recindex if recindex is not None: attrib['src'] = 'images/%s.jpg' % recindex + for attr in ('width', 'height'): + if attr in attrib: + val = attrib[attr] + if val.lower().endswith('em'): + try: + nval = float(val[:-2]) + nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile + attrib[attr] = "%dpx"%int(nval) + except: + del attrib[attr] elif tag.tag == 'pre': if not tag.text: tag.tag = 'div' diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 9f50796615..d0e394b9e5 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -411,6 +411,7 @@ class Style(object): return result def _unit_convert(self, value, base=None, font=None): + ' Return value in pts' if isinstance(value, (int, long, float)): return value try: @@ -447,6 +448,9 @@ class Style(object): result = value * 0.40 return result + def pt_to_px(self, value): + return (self._profile.dpi / 72.0) * value + @property def fontSize(self): def normalize_fontsize(value, base): From ab2c79226d1123e660cce6935d9c9a6a3273d3b7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 19 Jan 2010 15:46:16 -0700 Subject: [PATCH 12/32] New recipe for Algemeen Dagblad by kwetal --- resources/images/news/ad.png | Bin 0 -> 569 bytes resources/recipes/ad.recipe | 86 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 resources/images/news/ad.png create mode 100644 resources/recipes/ad.recipe diff --git a/resources/images/news/ad.png b/resources/images/news/ad.png new file mode 100644 index 0000000000000000000000000000000000000000..8b017910dff028186af8079acec32baebde0fe90 GIT binary patch literal 569 zcmV-90>=G`P)Px#24YJ`L;(K){{a7>y{D4^000SaNLh0L01FcU01FcV0GgZ_00007bV*G`2igG> z6gLiREyA||00Fs4L_t(I%f*vFYZOrw#eWl*Ku9qxSiKc|D{Lb&u;3poBKZUcs|2eB z8~;FLRRp{J01M9Am{R-%yHmzur;t>Uu!CA8NTzV6SwRIQ5S3%`W_M?aT?!9ec=x_{ z?>qP1^E^r76j5aR|HS@CTB%A~RV@G{9KovyYqKfam>&=w%;D*L2Ng8+;rm;z(GFH{c z-)Qmu!yf*#9^Br9-w8bJP$7y*ImWUGa3UFdh@ULM%@+RsRk*R~;%`%@3$?qm2OI56 zlD=sqwYv(bDKxl;E#uQbT5n40O;t!UxCU$X0R#;Id;(}K!Sfv+w$H=m2FVcwKNEPL z!dA!Key>nsl0#rl>O4)r2BpB41ik`iQut$`%Dj~S%WwP(-S7}zQQ>S$00000NkvXX Hu0mjf Date: Wed, 20 Jan 2010 10:01:29 -0500 Subject: [PATCH 13/32] Swap author option splits name at comma if comma found, otherwise splits at first space --- src/calibre/ebooks/metadata/meta.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 479b513ea5..1c22481263 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -134,7 +134,10 @@ def metadata_from_filename(name, pat=None): mi.authors = aus if prefs['swap_author_names'] and mi.authors: def swap(a): - parts = a.split() + if ',' in a: + parts = a.split(',', 1) + else: + parts = a.split(' ', 1) if len(parts) > 1: t = parts[-1] parts = parts[:-1] From d5b5af5b3fed17af056ed58e51c04c1d93266869 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 09:30:43 -0700 Subject: [PATCH 14/32] New recipe for Pajamas Media by Krittika Goyal --- resources/recipes/pajama.recipe | 48 +++++++++++++++++++++++++++++++++ src/calibre/utils/config.py | 1 + 2 files changed, 49 insertions(+) create mode 100644 resources/recipes/pajama.recipe diff --git a/resources/recipes/pajama.recipe b/resources/recipes/pajama.recipe new file mode 100644 index 0000000000..8c5ba74317 --- /dev/null +++ b/resources/recipes/pajama.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class PajamasMedia(BasicNewsRecipe): + title = u'Pajamas Media' + description = u'Provides exclusive news and opinion for forty countries.' + language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + recursions = 1 + match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$'] + #encoding = 'latin1' + + remove_stylesheets = True + #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + remove_tags_after = dict(name='div', attrs={'class':'paged-nav'}) + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['pages']}), + #dict(name='div', attrs={'id':['bookmark']}), + #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}), + #dict(name='ul', attrs={'class':'articleTools'}), + ] + + feeds = [ +('pajamas Media', + 'http://feeds.feedburner.com/PajamasMedia'), + +] + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'innerpage-content'}) + #td = heading.findParent(name='td') + #td.extract() + + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + return soup + + def postprocess_html(self, soup, first): + if not first: + h = soup.find(attrs={'class':'innerpage-header'}) + if h: h.extract() + auth = soup.find(attrs={'class':'author'}) + if auth: auth.extract() + return soup diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py index a0e5632cb7..22e31c3005 100644 --- a/src/calibre/utils/config.py +++ b/src/calibre/utils/config.py @@ -524,6 +524,7 @@ class DynamicConfig(dict): pass except: import traceback + print 'Failed to unpickle stored object:' traceback.print_exc() d = {} self.clear() From 0b5541edc2e4c666d7cbbfdd0d197cbaeb69b87f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 09:32:51 -0700 Subject: [PATCH 15/32] Updated recipe for Physics Today --- resources/recipes/physics_today.recipe | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/resources/recipes/physics_today.recipe b/resources/recipes/physics_today.recipe index 9b236ff23c..d1ce17cf32 100644 --- a/resources/recipes/physics_today.recipe +++ b/resources/recipes/physics_today.recipe @@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe): description = u'Physics Today magazine' publisher = 'American Institute of Physics' category = 'Physics' - language = 'en' - + language = 'en' cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg') oldest_article = 30 max_articles_per_feed = 100 @@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - br.open('http://www.physicstoday.org/pt/sso_login.jsp') - br.select_form(name='login') + br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f') + br.select_form(name='login_form') br['username'] = self.username br['password'] = self.password br.submit() return br - feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')] \ No newline at end of file + feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')] From 096735a456c79cefd6ebe0a5c9df1b142757845c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 14:58:07 -0700 Subject: [PATCH 16/32] Improved free WSJ recipe --- resources/recipes/wsj_free.recipe | 170 ++++++++++++++++++------------ 1 file changed, 103 insertions(+), 67 deletions(-) diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b05da400ae..495a7c343b 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -3,47 +3,122 @@ __license__ = 'GPL v3' ''' -online.wsj.com.com +online.wsj.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, NavigableString +from datetime import timedelta, datetime, date class WSJ(BasicNewsRecipe): # formatting adapted from original recipe by Kovid Goyal and Sujata Raman title = u'Wall Street Journal (free)' __author__ = 'Nick Redding' language = 'en' - description = ('All the free content from the Wall Street Journal (business' - ', financial and political news)') + description = ('All the free content from the Wall Street Journal (business, financial and political news)') + no_stylesheets = True timefmt = ' [%b %d]' - extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;} - h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;} - .targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;} - .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .tagline { ont-size:xx-small;} - .dateStamp {font-family:Arial,Helvetica,sans-serif;} - h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} - .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;} + + # customization notes: delete sections you are not interested in + # set omit_paid_content to False if you want the paid content article snippets + # set oldest_article to the maximum number of days back from today to include articles + sectionlist = [ + ['/home-page','Front Page'], + ['/public/page/news-opinion-commentary.html','Commentary'], + ['/public/page/news-global-world.html','World News'], + ['/public/page/news-world-business.html','US News'], + ['/public/page/news-business-us.html','Business'], + ['/public/page/news-financial-markets-stock.html','Markets'], + ['/public/page/news-tech-technology.html','Technology'], + ['/public/page/news-personal-finance.html','Personal Finnce'], + ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'], + ['/public/page/news-real-estate-homes.html','Real Estate'], + ['/public/page/news-career-jobs.html','Careers'], + ['/public/page/news-small-business-marketing.html','Small Business'] + ] + oldest_article = 2 + omit_paid_content = True + + extra_css = '''h1{font-size:large; font-family:Times,serif;} + h2{font-family:Times,serif; font-size:small; font-style:italic;} + .subhead{font-family:Times,serif; font-size:small; font-style:italic;} + .insettipUnit {font-family:Times,serif;font-size:xx-small;} + .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;} + .article{font-family:Times,serif; font-size:x-small;} + .tagline { font-size:xx-small;} + .dateStamp {font-family:Times,serif;} + h3{font-family:Times,serif; font-size:xx-small;} + .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;} .metadataType-articleCredits {list-style-type: none;} - h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;} + h6{font-family:Times,serif; font-size:small; font-style:italic;} .paperLocation{font-size:xx-small;}''' - remove_tags_before = dict(name='h1') - remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", - "articleTabs_tab_interactive","articleTabs_tab_video", - "articleTabs_tab_map","articleTabs_tab_slideshow"]), - {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map', - 'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip', - 'adSummary', 'nav-inline','insetFullBracket']}, - dict(rel='shortcut icon'), + + remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')}) + remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}), + #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", + # "articleTabs_tab_interactive","articleTabs_tab_video", + # "articleTabs_tab_map","articleTabs_tab_slideshow"]), + {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map', + 'insettip','insetClose','more_in', "insetContent", + # 'articleTools_bottom','articleTools_bottom mjArticleTools', + 'aTools', 'tooltip', + 'adSummary', 'nav-inline','insetFullBracket']}, + dict({'class':re.compile('^articleTools_bottom')}), + dict(rel='shortcut icon') ] remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}] + def get_browser(self): + br = BasicNewsRecipe.get_browser() + return br def preprocess_html(self,soup): + # check if article is too old + datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")}) + if datetag: + dateline_string = self.tag_to_string(datetag,False) + date_items = dateline_string.split(',') + datestring = date_items[0]+date_items[1] + article_date = datetime.strptime(datestring.title(),"%B %d %Y") + earliest_date = date.today() - timedelta(days=self.oldest_article) + if article_date.date() < earliest_date: + self.log("Skipping article dated %s" % datestring) + return None + datetag.parent.extract() + + # place dateline in article heading + + bylinetag = soup.find('h3','byline') + if bylinetag: + h3bylinetag = bylinetag + else: + bylinetag = soup.find('li','byline') + if bylinetag: + h3bylinetag = bylinetag.h3 + if not h3bylinetag: + h3bylinetag = bylinetag + bylinetag = bylinetag.parent + if bylinetag: + if h3bylinetag.a: + bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False) + else: + bylinetext = self.tag_to_string(h3bylinetag,False) + h3byline = Tag(soup,'h3',[('class','byline')]) + if bylinetext.isspace() or (bylinetext == ''): + h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1])) + else: + h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1])) + bylinetag.replaceWith(h3byline) + else: + headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")}) + if headlinetag: + dateline = Tag(soup,'h3', [('class','byline')]) + dateline.insert(0,NavigableString(date_items[0]+','+date_items[1])) + headlinetag.insert(len(headlinetag),dateline) + else: # if no date tag, don't process this page--it's not a news item + return None # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'}) if ultag: @@ -58,7 +133,7 @@ class WSJ(BasicNewsRecipe): key = None ans = [] - def parse_index_page(page_name,page_title,omit_paid_content): + def parse_index_page(page_name,page_title): def article_title(tag): atag = tag.find('h2') # title is usually in an h2 tag @@ -119,7 +194,6 @@ class WSJ(BasicNewsRecipe): soup = self.index_to_soup(pageurl) # Find each instance of div with class including "headlineSummary" for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}): - # divtag contains all article data as ul's and li's # first, check if there is an h3 tag which provides a section name stag = divtag.find('h3') @@ -162,7 +236,7 @@ class WSJ(BasicNewsRecipe): # now skip paid subscriber articles if desired subscriber_tag = litag.find(text="Subscriber Content") if subscriber_tag: - if omit_paid_content: + if self.omit_paid_content: continue # delete the tip div so it doesn't get in the way tiptag = litag.find("div", { "class" : "tipTargetBox" }) @@ -185,7 +259,7 @@ class WSJ(BasicNewsRecipe): continue if url.startswith("/article"): url = mainurl+url - if not url.startswith("http"): + if not url.startswith("http://online.wsj.com"): continue if not url.endswith(".html"): continue @@ -214,48 +288,10 @@ class WSJ(BasicNewsRecipe): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - # customization notes: delete sections you are not interested in - # set omit_paid_content to False if you want the paid content article previews - sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets', - 'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business'] - omit_paid_content = True - if 'Front Page' in sectionlist: - parse_index_page('/home-page','Front Page',omit_paid_content) - ans.append('Front Page') - if 'Commentary' in sectionlist: - parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content) - ans.append('Commentary') - if 'World News' in sectionlist: - parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content) - ans.append('World News') - if 'US News' in sectionlist: - parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content) - ans.append('US News') - if 'Business' in sectionlist: - parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content) - ans.append('Business') - if 'Markets' in sectionlist: - parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content) - ans.append('Markets') - if 'Technology' in sectionlist: - parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content) - ans.append('Technology') - if 'Personal Finance' in sectionlist: - parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content) - ans.append('Personal Finance') - if 'Life & Style' in sectionlist: - parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content) - ans.append('Life & Style') - if 'Real Estate' in sectionlist: - parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content) - ans.append('Real Estate') - if 'Careers' in sectionlist: - parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content) - ans.append('Careers') - if 'Small Business' in sectionlist: - parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content) - ans.append('Small Business') + for page_name,page_title in self.sectionlist: + parse_index_page(page_name,page_title) + ans.append(page_title) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans From 88b6d0035ee6398dc941553bc2ead23db1c1ed3b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 18:43:55 -0700 Subject: [PATCH 17/32] ... --- src/calibre/gui2/dialogs/metadata_single.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index a9130b2ea2..89b7c92125 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if self.cover_fetcher.exception is not None: err = self.cover_fetcher.exception error_dialog(self, _('Cannot fetch cover'), - _('Could not fetch cover.
')+repr(err)).exec_() + _('Could not fetch cover.
')+unicode(err)).exec_() return pix = QPixmap() From df019215ca394a89aba0922c909ffd036d50aeb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 18:47:43 -0700 Subject: [PATCH 18/32] Fix #4618 (New recipe for digitaljournal.com) --- resources/images/news/digitaljournal.png | Bin 0 -> 253 bytes resources/recipes/digitaljournal.recipe | 52 +++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 resources/images/news/digitaljournal.png create mode 100644 resources/recipes/digitaljournal.recipe diff --git a/resources/images/news/digitaljournal.png b/resources/images/news/digitaljournal.png new file mode 100644 index 0000000000000000000000000000000000000000..ea4637b8ad5c1d1a0639614d6f4bf0aabb8fb7c2 GIT binary patch literal 253 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*VR95qiD#}JFt$q5St3>Igpz z%C3Yf4S~#VybFXF82qj<%V=$SodDFKTH+c}l9E`GYL#4+3Zxi}3=9o)4GeXS3_=Vo ztW1ooOn_W-D+7b0&fKd|H00)|WTsUjYcR01GB&pY=?7^jE{%2rYGCkm^>bP0l+XkK D#WO;p literal 0 HcmV?d00001 diff --git a/resources/recipes/digitaljournal.recipe b/resources/recipes/digitaljournal.recipe new file mode 100644 index 0000000000..c49caf9580 --- /dev/null +++ b/resources/recipes/digitaljournal.recipe @@ -0,0 +1,52 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +digitaljournal.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class DigitalJournal(BasicNewsRecipe): + title = 'Digital Journal' + __author__ = 'Darko Miletic' + description = 'A Global Citizen Journalism News Network' + category = 'news, politics, USA, world' + publisher = 'Digital Journal' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf8' + language = 'en' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + keep_only_tags = [dict(name='div', attrs={'class':['article','body']})] + + remove_tags = [dict(name=['object','table'])] + + feeds = [ + (u'Latest News' , u'http://digitaljournal.com/rss/?feed=latest_news' ) + ,(u'Business' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Business' ) + ,(u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment') + ,(u'Environment' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment' ) + ,(u'Food' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Food' ) + ,(u'Health' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Health' ) + ,(u'Internet' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet' ) + ,(u'Politics' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics' ) + ,(u'Religion' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion' ) + ,(u'Science' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Science' ) + ,(u'Sports' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports' ) + ,(u'Technology' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology' ) + ,(u'World' , u'http://digitaljournal.com/rss/?feed=top_news&depname=World' ) + ,(u'Arts' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts' ) + ] + + def print_version(self, url): + return url.replace('digitaljournal.com/','digitaljournal.com/print/') + From 5132aba5f0434b231cbfe3d5d02acf64d1433f6c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 21:01:43 -0700 Subject: [PATCH 19/32] New recipes for various CanWest Canadian news sources by Nick Redding --- resources/recipes/calgary_herald.recipe | 121 +++++++++++++++ resources/recipes/edmonton_journal.recipe | 126 ++++++++++++++++ resources/recipes/montreal_gazette.recipe | 96 ++++++++++++ resources/recipes/ottawa_citizen.recipe | 101 +++++++++++++ resources/recipes/regina_leader_post.recipe | 116 ++++++++++++++ .../recipes/saskatoon_star_phoenix.recipe | 111 ++++++++++++++ resources/recipes/vancouver_provice.recipe | 136 +++++++++++++++++ resources/recipes/vancouver_sun.recipe | 131 ++++++++++++++++ resources/recipes/vic_times.recipe | 141 ++++++++++++++++++ resources/recipes/windows_star.recipe | 106 +++++++++++++ 10 files changed, 1185 insertions(+) create mode 100644 resources/recipes/calgary_herald.recipe create mode 100644 resources/recipes/edmonton_journal.recipe create mode 100644 resources/recipes/montreal_gazette.recipe create mode 100644 resources/recipes/ottawa_citizen.recipe create mode 100644 resources/recipes/regina_leader_post.recipe create mode 100644 resources/recipes/saskatoon_star_phoenix.recipe create mode 100644 resources/recipes/vancouver_provice.recipe create mode 100644 resources/recipes/vancouver_sun.recipe create mode 100644 resources/recipes/vic_times.recipe create mode 100644 resources/recipes/windows_star.recipe diff --git a/resources/recipes/calgary_herald.recipe b/resources/recipes/calgary_herald.recipe new file mode 100644 index 0000000000..884a951d96 --- /dev/null +++ b/resources/recipes/calgary_herald.recipe @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Calgary Herald + title = u'Calgary Herald' + url_prefix = 'http://www.calgaryherald.com' + description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/edmonton_journal.recipe b/resources/recipes/edmonton_journal.recipe new file mode 100644 index 0000000000..ac28b18f71 --- /dev/null +++ b/resources/recipes/edmonton_journal.recipe @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Edmonton Journal + title = u'Edmonton Journal' + url_prefix = 'http://www.edmontonjournal.com' + description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/montreal_gazette.recipe b/resources/recipes/montreal_gazette.recipe new file mode 100644 index 0000000000..3061cc37e4 --- /dev/null +++ b/resources/recipes/montreal_gazette.recipe @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Montreal Gazette + title = u'Montreal Gazette' + url_prefix = 'http://www.montrealgazette.com' + description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/ottawa_citizen.recipe b/resources/recipes/ottawa_citizen.recipe new file mode 100644 index 0000000000..5465212d4c --- /dev/null +++ b/resources/recipes/ottawa_citizen.recipe @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Ottawa Citizen + title = u'Ottawa Citizen' + url_prefix = 'http://www.ottawacitizen.com' + description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/regina_leader_post.recipe b/resources/recipes/regina_leader_post.recipe new file mode 100644 index 0000000000..9efec51848 --- /dev/null +++ b/resources/recipes/regina_leader_post.recipe @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Regina Leader-Post + title = u'Regina Leader-Post' + url_prefix = 'http://www.leaderpost.com' + description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/saskatoon_star_phoenix.recipe b/resources/recipes/saskatoon_star_phoenix.recipe new file mode 100644 index 0000000000..25330478d4 --- /dev/null +++ b/resources/recipes/saskatoon_star_phoenix.recipe @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Saskatoon Star-Phoenix + title = u'Saskatoon Star-Phoenix' + url_prefix = 'http://www.thestarphoenix.com' + description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_provice.recipe b/resources/recipes/vancouver_provice.recipe new file mode 100644 index 0000000000..9375670c59 --- /dev/null +++ b/resources/recipes/vancouver_provice.recipe @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Province + title = u'Vancouver Province' + url_prefix = 'http://www.theprovince.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_sun.recipe b/resources/recipes/vancouver_sun.recipe new file mode 100644 index 0000000000..8f12869bf9 --- /dev/null +++ b/resources/recipes/vancouver_sun.recipe @@ -0,0 +1,131 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Sun + title = u'Vancouver Sun' + url_prefix = 'http://www.vancouversun.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vic_times.recipe b/resources/recipes/vic_times.recipe new file mode 100644 index 0000000000..2dc8e96003 --- /dev/null +++ b/resources/recipes/vic_times.recipe @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Victoria Times Colonist + title = u'Victoria Times Colonist' + url_prefix = 'http://www.timescolonist.com' + description = u'News from Victoria, BC' + + # un-comment the following three lines for the Vancouver Province + #title = u'Vancouver Province' + #url_prefix = 'http://www.theprovince.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/windows_star.recipe b/resources/recipes/windows_star.recipe new file mode 100644 index 0000000000..4d34261bb7 --- /dev/null +++ b/resources/recipes/windows_star.recipe @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Windsor Star + title = u'Windsor Star' + url_prefix = 'http://www.windsorstar.com' + description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans From 46fa724b0aec92c2538207b6f9cfe9b6e5b833cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 21:05:26 -0700 Subject: [PATCH 20/32] Fix #4623 (WSJ News Download Fails with Included Recipe) --- resources/recipes/wsj.recipe | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index 70c05b1ded..3b6f56e3a3 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -91,6 +91,8 @@ class WallStreetJournal(BasicNewsRecipe): url = url.partition('#')[0] desc = '' d = x.findNextSibling(True) + if d is None: + continue if d.get('class', None) == 'arialResize': desc = self.tag_to_string(d) desc = desc.partition(u'\u2022')[0] From 419b7e42b371c0d30fba326e6e9e29d44db0c7c1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 21:15:12 -0700 Subject: [PATCH 21/32] ... --- resources/recipes/wsj.recipe | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index 3b6f56e3a3..da65471e60 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -91,9 +91,7 @@ class WallStreetJournal(BasicNewsRecipe): url = url.partition('#')[0] desc = '' d = x.findNextSibling(True) - if d is None: - continue - if d.get('class', None) == 'arialResize': + if d is not None and d.get('class', None) == 'arialResize': desc = self.tag_to_string(d) desc = desc.partition(u'\u2022')[0] self.log('\t\tFound article:', title) From ae2b434b35cafd9a42a2d10d3093ba1435e2ff2e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 21:54:34 -0700 Subject: [PATCH 22/32] ... --- resources/recipes/wsj.recipe | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index da65471e60..3ced77023d 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe +from calibre import strftime # http://online.wsj.com/page/us_in_todays_paper.html @@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe): def parse_index(self): soup = self.wsj_get_index() + year = strftime('%Y') + for x in soup.findAll('td', attrs={'class':'b14'}): + txt = self.tag_to_string(x).strip() + if year in txt: + self.timefmt = ' [%s]'%txt + break + left_column = soup.find( text=lambda t: 'begin ITP Left Column' in str(t)) From 1e12cfeb744de198c51d0dc5d4f72a8b669b3b30 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 23:16:30 -0700 Subject: [PATCH 23/32] Fix #4606 (Viewer crash on MOBI files created from PDB) --- resources/recipes/globe_and_mail.recipe | 2 +- src/calibre/gui2/viewer/documentview.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/resources/recipes/globe_and_mail.recipe b/resources/recipes/globe_and_mail.recipe index 71d6b2b304..0ef8bd9dd8 100644 --- a/resources/recipes/globe_and_mail.recipe +++ b/resources/recipes/globe_and_mail.recipe @@ -32,7 +32,7 @@ class GlobeAndMail(BasicNewsRecipe): 'gallery-controls', 'video', 'galleryLoading','deck','header', 'toolsBottom'] }, {'class':['credit','inline-img-caption','tab-pointer'] }, - dict(name='div', attrs={'id':'lead-photo'}), + dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}), dict(name='div', attrs={'class':'right'}), dict(name='div', attrs={'id':'footer'}), dict(name='div', attrs={'id':'beta-msg'}), diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index 790b1c4f2f..aedd709bb8 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -350,7 +350,13 @@ class Document(QWebPage): return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results def set_bottom_padding(self, amount): - self.javascript('$("body").css("padding-bottom", "%dpx")' % amount) + padding = '%dpx'%amount + try: + old_padding = unicode(self.javascript('$("body").css("padding-bottom")').toString()) + except: + old_padding = '' + if old_padding != padding: + self.javascript('$("body").css("padding-bottom", "%s")' % padding) class EntityDeclarationProcessor(object): From 7d2a8dd624506a63ef080fd8746c5fe84a56919f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 09:29:20 -0700 Subject: [PATCH 24/32] ... --- resources/recipes/ftd.recipe | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/recipes/ftd.recipe b/resources/recipes/ftd.recipe index d18f9bdc56..67eb4d08b1 100644 --- a/resources/recipes/ftd.recipe +++ b/resources/recipes/ftd.recipe @@ -23,6 +23,7 @@ class FTDe(BasicNewsRecipe): dict(id='topbanner'), dict(id='seitenkopf'), dict(id='BoxA-0-0-0'), + #dict(id='BoxA-2-0-0'), dict(id='footer'), dict(id='rating_open'), dict(id='ADS_Top'), @@ -59,6 +60,7 @@ class FTDe(BasicNewsRecipe): dict(name='div', attrs={'class':'relatedhalb'}), dict(name='div', attrs={'class':'box boxListScrollOutline'}), dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}), + dict(name='div', attrs={'class':'box boxTeaser boxPhotoshow boxImgWide'}), dict(name='div', attrs={'class':'box boxTeaser'}), dict(name='div', attrs={'class':'tagCloud'}), dict(name='div', attrs={'class':'pollView'}), From d61af79c8c0af73e1e8999d9e5e7cfa262d6d299 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 10:08:24 -0700 Subject: [PATCH 25/32] ... --- resources/images/news/ledevoir.png | Bin 0 -> 531 bytes src/calibre/ebooks/pdf/reflow.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 resources/images/news/ledevoir.png diff --git a/resources/images/news/ledevoir.png b/resources/images/news/ledevoir.png new file mode 100644 index 0000000000000000000000000000000000000000..eabcf9700489650323185e7f22af2083bb97bfc4 GIT binary patch literal 531 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87#M9lT^vI!P9L3Y=Sk` zMbUh_ZcO2v^|$qp`#0uQis_!m@3^>|yRH;q4v#*v?!#W*GROCOTOM3!G@f(#^WOWF zmiNUYb@m-$O!Q`-d)(~U8xyu>#@=JbpMTErddhaabf?}_i#|7JRvpirbhXKeHgcEq zwrfs5z2tIc-s;e{!-{obK>-PqrAs}R1ZjTyDYNyK?CGaQ8VNlI=Whu7@G3^{e1^%Q zn>lPZE^zMo@XAH0F*UNpYCWS_3HNO=m4`BxH@2NEu{wJGwpvN=eleAvmY(A-A3oRb zW6p27nsu~lui8u-UO$EV);BgL@EEn-e9q%3!ZrQJ&%HV&yJOFtJGWCy?EB^ir7DIq zeO`R8y7KF>C&$KUUNB+2lQUiY58^?%;y|6!C!)4XTA zd0ja$P*qD@BT7;dOH!?pi&B9UgOP!ufv$m}u90Dgp@Efw84&3jm|Gbb Date: Thu, 21 Jan 2010 10:45:29 -0700 Subject: [PATCH 26/32] Fix #4629 (Nook can't open files with "#" in name) --- src/calibre/devices/nook/driver.py | 3 ++- src/calibre/devices/usbms/device.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index c74a964648..16bf9479d8 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -86,4 +86,5 @@ class NOOK(USBMS): return drives - + def sanitize_path_components(self, components): + return [x.replace('#', '_') for x in components] diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index ab91de2abf..6ddfc81cf3 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -782,6 +782,13 @@ class Device(DeviceConfig, DevicePlugin): ''' return default + def sanitize_path_components(self, components): + ''' + Perform any device specific sanitization on the path components + for files to be uploaded to the device + ''' + return components + def create_upload_path(self, path, mdata, fname): path = os.path.abspath(path) extra_components = [] @@ -834,6 +841,7 @@ class Device(DeviceConfig, DevicePlugin): extra_components = list(map(remove_trailing_periods, extra_components)) components = shorten_components_to(250 - len(path), extra_components) + components = self.sanitize_path_components(components) filepath = os.path.join(path, *components) filedir = os.path.dirname(filepath) From 69c10e202c6b2a4db0d9148dd0ac30cb1cce4d6a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 10:54:25 -0700 Subject: [PATCH 27/32] New recipe for The Columbia Hournalism Review by XanthanGum --- resources/recipes/cjr.recipe | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 resources/recipes/cjr.recipe diff --git a/resources/recipes/cjr.recipe b/resources/recipes/cjr.recipe new file mode 100644 index 0000000000..d581184c4e --- /dev/null +++ b/resources/recipes/cjr.recipe @@ -0,0 +1,15 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class CJR(BasicNewsRecipe): + title = u'Columbia Journalism Review' + __author__ = u'Xanthan Gum' + description = 'News about journalism.' + language = 'en' + + oldest_article = 7 + max_articles_per_feed = 100 + + feeds = [(u'News Stories', u'http://www.cjr.org/index.xml')] + + def print_version(self, url): + return url + '?page=all&print=true' From 3e69d4c2aa3cf34d61e72b48b0e11dbc6edf83f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 11:28:45 -0700 Subject: [PATCH 28/32] News downloads: When getting an article URL from a RSS feed, look first for an original article link. This speeds up the download of news services that use a syndication service like feedburner or pheedo to publish their RSS feeds. --- src/calibre/web/feeds/news.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index dfcadf03ed..60b5ad0174 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -357,9 +357,17 @@ class BasicNewsRecipe(Recipe): Override in a subclass to customize extraction of the :term:`URL` that points to the content for each article. Return the article URL. It is called with `article`, an object representing a parsed article - from a feed. See `feedsparser `_. - By default it returns `article.link `_. + from a feed. See `feedparser `_. + By default it looks for the original link (for feeds syndicated via a + service like feedburner or pheedo) and if found, + returns that or else returns + `article.link `_. ''' + for key in article.keys(): + if key.endswith('_origlink'): + url = article[key] + if url and url.startswith('http://'): + return url return article.get('link', None) def preprocess_html(self, soup): From b3282b3ac569d9dac1f80bff98a641fe0ed18839 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 13:11:37 -0700 Subject: [PATCH 29/32] ... --- resources/recipes/wsj_free.recipe | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index 495a7c343b..b190f43849 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -8,7 +8,7 @@ online.wsj.com import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag, NavigableString -from datetime import timedelta, datetime, date +from datetime import timedelta, date class WSJ(BasicNewsRecipe): # formatting adapted from original recipe by Kovid Goyal and Sujata Raman @@ -74,16 +74,33 @@ class WSJ(BasicNewsRecipe): br = BasicNewsRecipe.get_browser() return br + def preprocess_html(self,soup): + + def decode_us_date(datestr): + udate = datestr.strip().lower().split() + m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1 + d = int(udate[1]) + y = int(udate[2]) + return date(y,m,d) + + # check if article is paid content + if self.omit_paid_content: + divtags = soup.findAll('div','tooltip') + if divtags: + for divtag in divtags: + if divtag.find(text="Subscriber Content"): + return None + # check if article is too old datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")}) if datetag: dateline_string = self.tag_to_string(datetag,False) date_items = dateline_string.split(',') datestring = date_items[0]+date_items[1] - article_date = datetime.strptime(datestring.title(),"%B %d %Y") + article_date = decode_us_date(datestring) earliest_date = date.today() - timedelta(days=self.oldest_article) - if article_date.date() < earliest_date: + if article_date < earliest_date: self.log("Skipping article dated %s" % datestring) return None datetag.parent.extract() From e2580655d1acf538a2b746d7d9ada517a6734c1a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 14:00:51 -0700 Subject: [PATCH 30/32] Development environment: First look for resources in the location pointed to by CALIBRE_DEVELOP_FROM. If not found, use the normal resource location --- src/calibre/gui2/tag_view.py | 2 +- src/calibre/utils/resources.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index 0f02f2a591..8ad0dff4d2 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -215,7 +215,7 @@ class TagsModel(QAbstractItemModel): return QModelIndex() child_item = index.internalPointer() - parent_item = child_item.parent + parent_item = getattr(child_item, 'parent', None) if parent_item is self.root_item or parent_item is None: return QModelIndex() diff --git a/src/calibre/utils/resources.py b/src/calibre/utils/resources.py index adfbebd9f0..a69db34f2e 100644 --- a/src/calibre/utils/resources.py +++ b/src/calibre/utils/resources.py @@ -9,9 +9,22 @@ __docformat__ = 'restructuredtext en' import __builtin__, sys, os +_dev_path = os.environ.get('CALIBRE_DEVELOP_FROM', None) +if _dev_path is not None: + _dev_path = os.path.join(os.path.abspath(os.path.dirname(_dev_path)), 'resources') + if not os.path.exists(_dev_path): + _dev_path = None + def get_path(path, data=False): + global _dev_path path = path.replace(os.sep, '/') - path = os.path.join(sys.resources_location, *path.split('/')) + base = None + if _dev_path is not None: + if os.path.exists(os.path.join(_dev_path, *path.split('/'))): + base = _dev_path + if base is None: + base = sys.resources_location + path = os.path.join(base, *path.split('/')) if data: return open(path, 'rb').read() return path From 3c084bb83e6c67f06992052c0493a45262781724 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 14:09:56 -0700 Subject: [PATCH 31/32] Fix #4428 (Became a new user today. Calibre does not "see" my Blackberry 8330) --- src/calibre/devices/blackberry/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index 1d96d4118f..ec8a7e8f49 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -18,7 +18,7 @@ class BLACKBERRY(USBMS): VENDOR_ID = [0x0fca] PRODUCT_ID = [0x8004, 0x0004] - BCD = [0x0200, 0x0107] + BCD = [0x0200, 0x0107, 0x0201] VENDOR_NAME = 'RIM' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' From 24a6d43b9115a4f6d4bb451c31158b0bf4618186 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Jan 2010 14:50:39 -0700 Subject: [PATCH 32/32] Improve handling of justification. Now calibre will explicitly change the justification of all left aligned paragraphs to justified or vice versa depending on the justification setting. This should make it possible to robustly convert all content to either justified or not. calibre will not touch centered or right aligned content. --- src/calibre/ebooks/fb2/fb2ml.py | 5 +- src/calibre/ebooks/lit/output.py | 2 +- src/calibre/ebooks/lit/writer.py | 13 ++- src/calibre/ebooks/mobi/mobiml.py | 3 +- src/calibre/ebooks/oeb/factory.py | 99 ------------------- src/calibre/ebooks/oeb/stylizer.py | 19 +++- src/calibre/ebooks/oeb/transforms/flatcss.py | 2 +- .../ebooks/oeb/transforms/manglecase.py | 15 +-- .../ebooks/oeb/transforms/rasterize.py | 3 +- src/calibre/ebooks/pml/pmlml.py | 3 +- src/calibre/ebooks/rb/rbml.py | 5 +- src/calibre/ebooks/rtf/rtfml.py | 5 +- src/calibre/ebooks/txt/txtml.py | 2 +- 13 files changed, 49 insertions(+), 127 deletions(-) delete mode 100644 src/calibre/ebooks/oeb/factory.py diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 42feeb2330..c8428cf136 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -132,7 +132,8 @@ class FB2MLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output @@ -152,7 +153,7 @@ class FB2MLizer(object): text = [] for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) text.append(self.add_page_anchor(item)) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) return ''.join(text) diff --git a/src/calibre/ebooks/lit/output.py b/src/calibre/ebooks/lit/output.py index 2a08ff51a8..423fb9ce7c 100644 --- a/src/calibre/ebooks/lit/output.py +++ b/src/calibre/ebooks/lit/output.py @@ -32,7 +32,7 @@ class LITOutput(OutputFormatPlugin): mangler(oeb, opts) rasterizer = SVGRasterizer() rasterizer(oeb, opts) - lit = LitWriter() + lit = LitWriter(self.opts) lit(oeb, output_path) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 6dd5068032..cf9ea6aa77 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -134,7 +134,7 @@ def warn(x): class ReBinary(object): NSRMAP = {'': None, XML_NS: 'xml'} - def __init__(self, root, item, oeb, map=HTML_MAP): + def __init__(self, root, item, oeb, opts, map=HTML_MAP): self.item = item self.logger = oeb.logger self.manifest = oeb.manifest @@ -143,7 +143,7 @@ class ReBinary(object): self.anchors = [] self.page_breaks = [] self.is_html = is_html = map is HTML_MAP - self.stylizer = Stylizer(root, item.href, oeb) if is_html else None + self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None self.tree_to_binary(root) self.content = self.buf.getvalue() self.ahc = self.build_ahc() if is_html else None @@ -295,9 +295,8 @@ def preserve(function): return wrapper class LitWriter(object): - def __init__(self): - # Wow, no options - pass + def __init__(self, opts): + self.opts = opts def _litize_oeb(self): oeb = self._oeb @@ -469,7 +468,7 @@ class LitWriter(object): secnum = 0 if isinstance(data, etree._Element): self._add_folder(name) - rebin = ReBinary(data, item, self._oeb, map=HTML_MAP) + rebin = ReBinary(data, item, self._oeb, self.opts, map=HTML_MAP) self._add_file(name + '/ahc', rebin.ahc, 0) self._add_file(name + '/aht', rebin.aht, 0) item.page_breaks = rebin.page_breaks @@ -562,7 +561,7 @@ class LitWriter(object): meta.attrib['ms--minimum_level'] = '0' meta.attrib['ms--attr5'] = '1' meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper() - rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP) + rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP) meta = rebin.content self._meta = meta self._add_file('/meta', meta) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index aa69ba446b..f958b63a12 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -92,6 +92,7 @@ class MobiMLizer(object): def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb + self.opts = context self.profile = profile = context.dest self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) @@ -114,7 +115,7 @@ class MobiMLizer(object): def mobimlize_spine(self): 'Iterate over the spine and convert it to MOBIML' for item in self.oeb.spine: - stylizer = Stylizer(item.data, item.href, self.oeb, self.profile) + stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile) body = item.data.find(XHTML('body')) nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) nbody = etree.SubElement(nroot, XHTML('body')) diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py deleted file mode 100644 index 8add71d20d..0000000000 --- a/src/calibre/ebooks/oeb/factory.py +++ /dev/null @@ -1,99 +0,0 @@ -''' -Registry associating file extensions with Reader classes. -''' -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift ' - -import sys, os, logging -from itertools import chain -import calibre -from calibre.ebooks.oeb.base import OEBError -from calibre.ebooks.oeb.reader import OEBReader -from calibre.ebooks.oeb.writer import OEBWriter -from calibre.ebooks.lit.reader import LitReader -from calibre.ebooks.lit.writer import LitWriter -from calibre.ebooks.mobi.reader import MobiReader -from calibre.ebooks.mobi.writer import MobiWriter -from calibre.ebooks.oeb.base import OEBBook -from calibre.ebooks.oeb.profile import Context -from calibre.utils.config import Config - -__all__ = ['get_reader'] - -REGISTRY = { - '.opf': (OEBReader, None), - '.lit': (LitReader, LitWriter), - '.mobi': (MobiReader, MobiWriter), - } - -def ReaderFactory(path): - if os.path.isdir(path): - return OEBReader - ext = os.path.splitext(path)[1].lower() - Reader = REGISTRY.get(ext, (None, None))[0] - if Reader is None: - raise OEBError('Unknown e-book file extension %r' % ext) - return Reader - -def WriterFactory(path): - if os.path.isdir(path): - return OEBWriter - ext = os.path.splitext(path)[1].lower() - if not os.path.exists(path) and not ext: - return OEBWriter - Writer = REGISTRY.get(ext, (None, None))[1] - if Writer is None: - raise OEBError('Unknown e-book file extension %r' % ext) - return Writer - - -def option_parser(Reader, Writer): - cfg = Config('ebook-convert', _('Options to control e-book conversion.')) - Reader.config(cfg) - for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): - Transform.config(cfg) - Writer.config(cfg) - parser = cfg.option_parser() - parser.add_option('--encoding', default=None, - help=_('Character encoding for input. Default is to auto detect.')) - parser.add_option('-o', '--output', default=None, - help=_('Output file. Default is derived from input filename.')) - parser.add_option('-p', '--pretty-print', action='store_true', - default=False, help=_('Produce more human-readable XML output.')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def main(argv=sys.argv): - if len(argv) < 3: - print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") - return 1 - inpath, outpath = argv[1], argv[2] - Reader = ReaderFactory(inpath) - Writer = WriterFactory(outpath) - parser = option_parser(Reader, Writer) - opts, args = parser.parse_args(argv[3:]) - if len(args) != 0: - parser.print_help() - return 1 - logger = logging.getLogger('ebook-convert') - calibre.setup_cli_handlers(logger, logging.DEBUG) - encoding = opts.encoding - pretty_print = opts.pretty_print - oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) - context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) - reader = Reader.generate(opts) - writer = Writer.generate(opts) - transforms = [] - for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): - transforms.append(Transform.generate(opts)) - reader(oeb, inpath) - for transform in transforms: - transform(oeb, context) - writer(oeb, outpath) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index d0e394b9e5..26fb4ca980 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -110,9 +110,9 @@ class CSSSelector(etree.XPath): class Stylizer(object): STYLESHEETS = WeakKeyDictionary() - def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], + def __init__(self, tree, path, oeb, opts, profile=PROFILES['PRS505'], extra_css='', user_css=''): - self.oeb = oeb + self.oeb, self.opts = oeb, opts self.profile = profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] @@ -249,6 +249,8 @@ class Stylizer(object): style.update(self._normalize_font(prop.cssValue)) elif name == 'list-style': style.update(self._normalize_list_style(prop.cssValue)) + elif name == 'text-align': + style.update(self._normalize_text_align(prop.cssValue)) else: style[name] = prop.value if 'font-size' in style: @@ -306,6 +308,19 @@ class Stylizer(object): return style + def _normalize_text_align(self, cssvalue): + style = {} + text = cssvalue.cssText + if text == 'inherit': + style['text-align'] = 'inherit' + else: + if text in ('left', 'justify'): + val = 'left' if self.opts.dont_justify else 'justify' + style['text-align'] = val + else: + style['text-align'] = text + return style + def _normalize_font(self, cssvalue): composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family') diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 61226ca4f4..1eb6afc1b5 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -141,7 +141,7 @@ class CSSFlattener(object): bs.append('text-align: '+ \ ('left' if self.context.dont_justify else 'justify')) body.set('style', '; '.join(bs)) - stylizer = Stylizer(html, item.href, self.oeb, profile, + stylizer = Stylizer(html, item.href, self.oeb, self.context, profile, user_css=self.context.extra_css, extra_css=css) self.stylizers[item] = stylizer diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 4b852db6c4..04bf63ac1d 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,13 +29,14 @@ class CaseMangler(object): @classmethod def generate(cls, opts): return cls() - + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb + self.opts = context self.profile = context.source self.mangle_spine() - + def mangle_spine(self): id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) @@ -44,9 +45,9 @@ class CaseMangler(object): relhref = item.relhref(href) etree.SubElement(html.find(XHTML('head')), XHTML('link'), rel='stylesheet', href=relhref, type=CSS_MIME) - stylizer = Stylizer(html, item.href, self.oeb, self.profile) + stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) self.mangle_elem(html.find(XHTML('body')), stylizer) - + def text_transform(self, transform, text): if transform == 'capitalize': return text.title() @@ -55,7 +56,7 @@ class CaseMangler(object): elif transform == 'lowercase': return text.lower() return text - + def split_text(self, text): results = [''] isupper = text[0].isupper() @@ -66,7 +67,7 @@ class CaseMangler(object): isupper = not isupper results.append(char) return results - + def smallcaps_elem(self, elem, attr): texts = self.split_text(getattr(elem, attr)) setattr(elem, attr, None) @@ -90,7 +91,7 @@ class CaseMangler(object): last.tail = tail child.tail = None last = child - + def mangle_elem(self, elem, stylizer): if not isinstance(elem.tag, basestring) or \ namespace(elem.tag) != XHTML_NS: diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 30357b10d2..ac28e51b15 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -44,6 +44,7 @@ class SVGRasterizer(object): def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') self.oeb = oeb + self.opts = context self.profile = context.dest self.images = {} self.dataize_manifest() @@ -102,7 +103,7 @@ class SVGRasterizer(object): def rasterize_spine(self): for item in self.oeb.spine: html = item.data - stylizer = Stylizer(html, item.href, self.oeb, self.profile) + stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) self.rasterize_item(item, stylizer) def rasterize_item(self, item, stylizer): diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 4f3d5f23df..e3609fcddb 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -113,7 +113,8 @@ class PMLMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 5574aa94b6..50153d7d4d 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -90,7 +90,8 @@ class RBMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output @@ -111,7 +112,7 @@ class RBMLizer(object): output = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to RocketBook HTML...' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) output.append(self.add_page_anchor(item)) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) return ''.join(output) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 6aa48ad61b..1217482823 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -111,12 +111,13 @@ class RTFMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += '{\\page } ' for item in self.oeb_book.spine: self.log.debug('Converting %s to RTF markup...' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.footer() output = self.insert_images(output) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 7642e051fe..bb730c0720 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -54,7 +54,7 @@ class TXTMLizer(object): output.append(self.get_toc()) for item in self.oeb_book.spine: self.log.debug('Converting %s to TXT...' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) output += self.dump_text(etree.fromstring(content), stylizer)