diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py
index 4ae0278133..32aeba9122 100644
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@@ -69,9 +69,12 @@ categories_use_field_for_author_name = 'author'
# avg_rating: the average rating of all the books referencing this item
# sort: the sort value. For authors, this is the author_sort for that author
# category: the category (e.g., authors, series) that the item is in.
-categories_collapsed_name_template = '{first.sort:shorten(4,'',0)} - {last.sort:shorten(4,'',0)}'
-categories_collapsed_rating_template = '{first.avg_rating:4.2f:ifempty(0)} - {last.avg_rating:4.2f:ifempty(0)}'
-categories_collapsed_popularity_template = '{first.count:d} - {last.count:d}'
+# Note that the r prefix (the "r'" in front of the {) makes the template a raw
+# string, which is necessary if there are backslashes (\ characters) in the
+# template. It doesn't hurt anything to leave it there even if there aren't any
+# backslashes.
+# NOTE(review): the '' inside the name template closes and reopens the string
+# literal, so Python concatenates the adjacent pieces and the stored value reads
+# shorten(4,,0); only the first piece carries the r prefix.
+categories_collapsed_name_template = r'{first.sort:shorten(4,'',0)} - {last.sort:shorten(4,'',0)}'
+categories_collapsed_rating_template = r'{first.avg_rating:4.2f:ifempty(0)} - {last.avg_rating:4.2f:ifempty(0)}'
+categories_collapsed_popularity_template = r'{first.count:d} - {last.count:d}'
# Set whether boolean custom columns are two- or three-valued.
diff --git a/resources/images/news/arabian_business.png b/resources/images/news/arabian_business.png
new file mode 100644
index 0000000000..e949830988
Binary files /dev/null and b/resources/images/news/arabian_business.png differ
diff --git a/resources/recipes/arabian_business.recipe b/resources/recipes/arabian_business.recipe
new file mode 100644
index 0000000000..8b41c99e68
--- /dev/null
+++ b/resources/recipes/arabian_business.recipe
@@ -0,0 +1,86 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Darko Miletic '
+'''
+www.arabianbusiness.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Arabian_Business(BasicNewsRecipe):
+ title = 'Arabian Business'
+ __author__ = 'Darko Miletic'
+ description = 'Comprehensive Guide to Middle East Business & Gulf Industry News including,Banking & Finance,Construction,Energy,Media & Marketing,Real Estate,Transportation,Travel,Technology,Politics,Healthcare,Lifestyle,Jobs & UAE guide.Top Gulf & Dubai Business News.'
+ publisher = 'Arabian Business Publishing Ltd.'
+ category = 'ArabianBusiness.com,Arab Business News,Middle East Business News,Middle East Business,Arab Media News,Industry Events,Middle East Industry News,Arab Business Industry,Dubai Business News,Financial News,UAE Business News,Middle East Press Releases,Gulf News,Arab News,GCC Business News,Banking Finance,Media Marketing,Construction,Oil Gas,Retail,Transportation,Travel Hospitality,Photos,Videos,Life Style,Fashion,United Arab Emirates,UAE,Dubai,Sharjah,Abu Dhabi,Qatar,KSA,Saudi Arabia,Bahrain,Kuwait,Oman,Europe,South Asia,America,Asia,news'
+ oldest_article = 2
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'en'
+ remove_empty_feeds = True
+ publication_type = 'newsportal'
+ masthead_url = 'http://www.arabianbusiness.com/skins/ab.main/gfx/arabianbusiness_logo_sm.gif'
+ extra_css = """
+ body{font-family: Georgia,serif }
+ img{margin-bottom: 0.4em; margin-top: 0.4em; display:block}
+ .byline,.dateline{font-size: small; display: inline; font-weight: bold}
+ ul{list-style: none outside none;}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags_before=dict(attrs={'id':'article-title'})
+ remove_tags = [
+ dict(name=['meta','link','base','iframe','embed','object'])
+ ,dict(attrs={'class':'printfooter'})
+ ]
+ remove_attributes=['lang']
+
+
+ feeds = [
+ (u'Africa' , u'http://www.arabianbusiness.com/world/Africa/?service=rss' )
+ ,(u'Americas' , u'http://www.arabianbusiness.com/world/americas/?service=rss' )
+ ,(u'Asia Pacific' , u'http://www.arabianbusiness.com/world/asia-pacific/?service=rss' )
+ ,(u'Europe' , u'http://www.arabianbusiness.com/world/europe/?service=rss' )
+ ,(u'Middle East' , u'http://www.arabianbusiness.com/world/middle-east/?service=rss' )
+ ,(u'South Asia' , u'http://www.arabianbusiness.com/world/south-asia/?service=rss' )
+ ,(u'Banking & Finance', u'http://www.arabianbusiness.com/industries/banking-finance/?service=rss' )
+ ,(u'Construction' , u'http://www.arabianbusiness.com/industries/construction/?service=rss' )
+ ,(u'Education' , u'http://www.arabianbusiness.com/industries/education/?service=rss' )
+ ,(u'Energy' , u'http://www.arabianbusiness.com/industries/energy/?service=rss' )
+ ,(u'Healthcare' , u'http://www.arabianbusiness.com/industries/healthcare/?service=rss' )
+ ,(u'Media' , u'http://www.arabianbusiness.com/industries/media/?service=rss' )
+ ,(u'Real Estate' , u'http://www.arabianbusiness.com/industries/real-estate/?service=rss' )
+ ,(u'Retail' , u'http://www.arabianbusiness.com/industries/retail/?service=rss' )
+ ,(u'Technology' , u'http://www.arabianbusiness.com/industries/technology/?service=rss' )
+ ,(u'Transport' , u'http://www.arabianbusiness.com/industries/transport/?service=rss' )
+ ,(u'Travel' , u'http://www.arabianbusiness.com/industries/travel-hospitality/?service=rss')
+ ,(u'Equities' , u'http://www.arabianbusiness.com/markets/equities/?service=rss' )
+ ,(u'Commodities' , u'http://www.arabianbusiness.com/markets/commodities/?service=rss' )
+ ,(u'Currencies' , u'http://www.arabianbusiness.com/markets/currencies/?service=rss' )
+ ,(u'Market Data' , u'http://www.arabianbusiness.com/markets/market-data/?service=rss' )
+ ,(u'Comment' , u'http://www.arabianbusiness.com/opinion/comment/?service=rss' )
+ ,(u'Think Tank' , u'http://www.arabianbusiness.com/opinion/think-tank/?service=rss' )
+ ,(u'Arts' , u'http://www.arabianbusiness.com/lifestyle/arts/?service=rss' )
+ ,(u'Cars' , u'http://www.arabianbusiness.com/lifestyle/cars/?service=rss' )
+ ,(u'Food' , u'http://www.arabianbusiness.com/lifestyle/food/?service=rss' )
+ ,(u'Sport' , u'http://www.arabianbusiness.com/lifestyle/sport/?service=rss' )
+ ]
+
+ def print_version(self, url):
+ return url + '?service=printer&page='
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for alink in soup.findAll('a'):
+ if alink.string is not None:
+ tstr = alink.string
+ alink.replaceWith(tstr)
+ return soup
diff --git a/resources/recipes/atlantic.recipe b/resources/recipes/atlantic.recipe
index 5ae0f7d993..daf73aebdc 100644
--- a/resources/recipes/atlantic.recipe
+++ b/resources/recipes/atlantic.recipe
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal '
'''
theatlantic.com
'''
-import string, re
+import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
@@ -33,25 +33,27 @@ class TheAtlantic(BasicNewsRecipe):
articles = []
soup = self.index_to_soup(self.INDEX)
- sectit = soup.find('h1', attrs={'class':'sectionTitle'})
- if sectit is not None:
- texts = self.tag_to_string(sectit).strip().split()[-2:]
- if texts:
- self.timefmt = ' [%s]'%(' '.join(texts))
+ ts = soup.find(id='magazineTopStories')
+ ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
+ self.timefmt = ' [%s]'%ds
cover = soup.find('img', src=True, attrs={'class':'cover'})
if cover is not None:
self.cover_url = cover['src']
feeds = []
+ seen_titles = set([])
for section in soup.findAll('div', attrs={'class':'magazineSection'}):
- section_title = section.find(attrs={'class':'sectionHeader'})
- section_title = string.capwords(self.tag_to_string(section_title))
+ section_title = self.tag_to_string(section.find('h2'))
self.log('Found section:', section_title)
articles = []
- for post in section.findAll('div', attrs={'class':'post'}):
+ for post in section.findAll('div', attrs={'class':lambda x : x and
+ 'post' in x}):
h = post.find(['h3', 'h4'])
title = self.tag_to_string(h)
+ if title in seen_titles:
+ continue
+ seen_titles.add(title)
a = post.find('a', href=True)
url = a['href']
if url.startswith('/'):
@@ -64,36 +66,23 @@ class TheAtlantic(BasicNewsRecipe):
self.log('\t\t', desc)
articles.append({'title':title, 'url':url, 'description':desc,
'date':''})
- feeds.append((section_title, articles))
+ if articles:
+ feeds.append((section_title, articles))
poems = []
self.log('Found section: Poems')
- for poem in soup.findAll('div', attrs={'class':'poem'}):
- title = self.tag_to_string(poem.find('h4'))
- desc = self.tag_to_string(poem.find(attrs={'class':'author'}))
+ pd = soup.find('h2', text='Poetry').parent.parent
+ for poem in pd.findAll('h4'):
+ title = self.tag_to_string(poem)
url = poem.find('a')['href']
if url.startswith('/'):
url = 'http://www.theatlantic.com' + url
self.log('\tFound article:', title, 'at', url)
- self.log('\t\t', desc)
- poems.append({'title':title, 'url':url, 'description':desc,
+ poems.append({'title':title, 'url':url, 'description':'',
'date':''})
if poems:
feeds.append(('Poems', poems))
- div = soup.find(id='advice')
- if div is not None:
- self.log('Found section: Advice')
- title = self.tag_to_string(div.find('h4'))
- url = div.find('a')['href']
- if url.startswith('/'):
- url = 'http://www.theatlantic.com' + url
- desc = self.tag_to_string(div.find('p'))
- self.log('\tFound article:', title, 'at', url)
- self.log('\t\t', desc)
-
- feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
- 'date':''}]))
return feeds
def postprocess_html(self, soup, first):
diff --git a/resources/recipes/baltimore_sun.recipe b/resources/recipes/baltimore_sun.recipe
new file mode 100644
index 0000000000..ac6906a5e6
--- /dev/null
+++ b/resources/recipes/baltimore_sun.recipe
@@ -0,0 +1,186 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = 'Original 2009, Kovid Goyal '
+__copyright__= 'Modified 2011, Josh Hall '
+__docformat__ = 'restructuredtext en'
+
+'''
+www.baltimoresun.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BaltimoreSun(BasicNewsRecipe):
+
+ title = 'The Baltimore Sun'
+ __author__ = 'Josh Hall'
+ description = 'Politics, local and business news from Baltimore'
+ language = 'en'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ remove_empty_feeds = True
+ use_embedded_content = False
+ no_stylesheets = True
+ remove_javascript = True
+ #masthead_url = 'http://www.baltimoresun.com/images/thirdpartylogo.gif'
+
+ remove_tags_before = dict(name='div', attrs={'class':['story', 'entry']})
+ remove_tags_after = [
+ {'class':['photo_article',]},
+ dict(name='div', attrs={'class':'shirttail-promo right clearfix'}),
+ ]
+
+ keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
+ dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
+ ]
+
+
+ remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer","article-promo"]},
+ {'class':["entry-footer-left","entry-footer-right","shirttail-promo right clearfix","clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent","toppaginate","module","module-header","module-content"]},
+ dict(name='font',attrs={'id':["cr-other-headlines"]}),
+ dict(name=['iframe']),
+ ]
+ extra_css = '''
+ h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
+ .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+ '''
+ feeds = [
+ (u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'),
+ (u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'),
+ (u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'),
+ #(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'),
+ (u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'),
+ #(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'),
+ #(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'),
+ #(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'),
+ #(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'),
+ (u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'),
+ #(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'),
+ (u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'),
+ (u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'),
+ #(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'),
+ (u'Nation/world', u'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
+ (u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'),
+
+
+ (u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'),
+ (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
+ (u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'),
+ #(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'),
+ #(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'),
+ #(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'),
+ #(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'),
+ #(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'),
+ #(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'),
+ #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'),
+ #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'),
+
+ (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'),
+ (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'),
+ (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
+ (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
+ (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
+ (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
+
+ (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'),
+ (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'),
+ (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
+ (u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'),
+ (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
+ (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
+ (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
+
+ (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
+ (u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
+ (u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
+ (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
+ (u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'),
+ (u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'),
+ (u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'),
+ (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
+
+ (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
+ (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
+ (u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'),
+
+ (u'Kevin Cowherd', 'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
+ (u'Jay Hancock', u'http://www.baltimoresun.com/business/money/bal-columnist-hancock,0,6673611.columnist-rss2.0.xml'),
+ (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
+ (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
+ (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
+ (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
+ (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
+ (u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
+ (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
+ (u'Ron Smith', u'http://www.baltimoresun.com/news/opinion/bal-columnist-ronsmith,0,3964803.columnist-rss2.0.xml'),
+
+ (u'Baltimore Crime Beat', u'http://weblogs.baltimoresun.com/news/crime/blog/index.xml'),
+ (u'Getting There', u'http://weblogs.baltimoresun.com/news/traffic/index.xml'),
+ (u'InsideEd', u'http://weblogs.baltimoresun.com/news/education/blog/index.xml'),
+ (u'Maryland Politics', u'http://weblogs.baltimoresun.com/news/local/politics/index.xml'),
+ (u'Maryland Weather', u'http://weblogs.marylandweather.com/index.xml'),
+ (u'Second Opinion', u'http://weblogs.baltimoresun.com/news/opinion/index.xml'),
+ (u'You Dont Say', u'http://weblogs.baltimoresun.com/news/mcintyre/blog/index.xml'),
+
+ (u'BaltTech', u'http://weblogs.baltimoresun.com/news/technology/index.xml'),
+ (u'Consuming Interests', u'http://weblogs.baltimoresun.com/business/consuminginterests/blog/index.xml'),
+ (u'Jay Hancocks Blog', u'http://weblogs.baltimoresun.com/business/hancock/blog/index.xml'),
+ (u'The Real Estate Wonk', u'http://weblogs.baltimoresun.com/business/realestate/blog/index.xml'),
+
+ (u'Clef Notes', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'),
+ (u'Dining at Large', u'http://weblogs.baltimoresun.com/entertainment/dining/reviews/blog/index.xml'),
+ (u'Midnight Sun', u'http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/index.xml'),
+ (u'Mike Sragow Gets Reel', u'http://weblogs.baltimoresun.com/entertainment/movies/blog/index.xml'),
+ (u'Read Street', u'http://weblogs.baltimoresun.com/entertainment/books/blog/index.xml'),
+ (u'Reality Check', u'http://weblogs.baltimoresun.com/entertainment/realitycheck/blog/index.xml'),
+ (u'Z on TV', u'http://weblogs.baltimoresun.com/entertainment/zontv/index.xml'),
+
+ (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
+ (u'Charm City Moms', u'http://weblogs.baltimoresun.com/features/baltimoremomblog/index.xml'),
+ (u'Exercists', u'http://weblogs.baltimoresun.com/health/fitness/index.xml'),
+ (u'Garden Variety', 'http://weblogs.baltimoresun.com/features/gardening/index.xml'),
+ #(u'In Good Faith', u'http://weblogs.baltimoresun.com/news/faith/index.xml'),
+ (u'Picture of Health', u'http://weblogs.baltimoresun.com/health/index.xml'),
+ (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
+
+ #(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'),
+ #(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'),
+ (u'Orioles Insider', u'http://weblogs.baltimoresun.com/sports/orioles/blog/index.xml'),
+ #(u'Outdoors Girl', u'http://weblogs.baltimoresun.com/sports/outdoors/blog/index.xml'),
+ (u'Ravens Insider', u'http://weblogs.baltimoresun.com/sports/ravens/blog/index.xml'),
+ #(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'),
+ #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
+ (u'The Schmuck Stops Here', u'http://weblogs.baltimoresun.com/sports/schmuck/index.xml'),
+ (u'Toy Department', u'http://weblogs.baltimoresun.com/sports/thetoydepartment/index.xml'),
+ #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
+ #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
+ (u'Virtual Vensanity', u'http://weblogs.baltimoresun.com/entertainment/bthesite/vensel/index.xml'),
+
+ ]
+
+
+ def get_article_url(self, article):
+ print article.get('feedburner_origlink', article.get('guid', article.get('link')))
+ return article.get('feedburner_origlink', article.get('guid', article.get('link')))
+
+
+ def postprocess_html(self, soup, first_fetch):
+ for t in soup.findAll(['table', 'tr', 'td']):
+ t.name = 'div'
+
+ for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
+ tag.extract()
+ for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
+ tag.extract()
+
+ return soup
diff --git a/resources/recipes/danas.recipe b/resources/recipes/danas.recipe
index 9002a6b505..d27b88a49c 100644
--- a/resources/recipes/danas.recipe
+++ b/resources/recipes/danas.recipe
@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic '
+__copyright__ = '2008-2011, Darko Miletic '
'''
danas.rs
'''
@@ -33,13 +33,15 @@ class Danas(BasicNewsRecipe):
margin-bottom: 0;
margin-top: 0}
h2,.datum,.lokacija,.autor{font-size: small}
+ .autor{text-transform: uppercase}
.antrfileNaslov{border-left: 2px solid #999999;
margin-left: 0.8em;
padding-left: 1.2em;
font-weight:bold;
margin-bottom: 0;
margin-top: 0}
- img{margin-bottom: 0.8em}
+ img{margin-bottom: 0.8em}
+ .naslovTemeDana{font-size: small}
"""
conversion_options = {
@@ -62,6 +64,7 @@ class Danas(BasicNewsRecipe):
,(re.compile(u'\u00f4'), lambda match: '“') # latin small letter o with circumflex
,(re.compile(u'\u00f6'), lambda match: '”') # latin small letter o with dieaeresis
,(re.compile(u'\u00e1'), lambda match: ' ' ) # latin small letter a with acute
+ ,(re.compile(u'\u00e4'), lambda match: ' ' ) # latin small letter a with dieaeresis
]
keep_only_tags = [dict(name='div', attrs={'id':'left'})]
@@ -124,6 +127,6 @@ class Danas(BasicNewsRecipe):
cover_url = None
soup = self.index_to_soup('http://www.danas.rs/')
for citem in soup.findAll('img'):
- if citem['src'].endswith('naslovna.jpg'):
+ if citem['src'].endswith('naslovna.jpg') or citem['src'].endswith('naslovna1.jpg'):
return 'http://www.danas.rs' + citem['src']
return cover_url
diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe
new file mode 100644
index 0000000000..980d59d3d1
--- /dev/null
+++ b/resources/recipes/deia.recipe
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__author__ = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez'
+description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+
+'''
+deia.com
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class Deia(BasicNewsRecipe):
+ title ='Deia'
+ __author__ ='Gerardo Diez'
+ publisher ='Editorial Iparraguirre, S.A'
+ category ='news, politics, finances, world, spain, euskadi'
+ publication_type ='newspaper'
+ oldest_article =1
+ max_articles_per_feed =100
+ simultaneous_downloads =10
+ cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
+ timefmt ='[%a, %d %b, %Y]'
+ encoding ='utf8'
+ language ='es_ES'
+ remove_javascript =True
+ remove_tags_after =dict(id='Texto')
+ remove_tags_before =dict(id='Texto')
+ remove_tags =[dict(name='div', attrs={'class':['Herramientas ', 'Multimedia']})]
+ no_stylesheets =True
+ extra_css ='h1 {margin-bottom: .15em;font-size: 2.7em; font-family: Georgia, "Times New Roman", Times, serif;} .Antetitulo {margin: 1em 0;text-transform: uppercase;color: #999;} .PieFoto {margin: .1em 0;padding: .5em .5em .5em .5em;background: #F0F0F0;} .PieFoto p {margin-bottom: 0;font-family: Georgia,"Times New Roman",Times,serif;font-weight: bold; font-style: italic; color: #666;}'
+ keep_only_tags =[dict(name='div', attrs={'class':['Texto ', 'NoticiaFicha ']})]
+ feeds = [
+ (u'Bizkaia' ,u'http://www.deia.com/index.php/services/rss?seccion=bizkaia'),
+ (u'Bilbao' ,u'http://www.deia.com/index.php/services/rss?seccion=bilbao'),
+ (u'Hemendik eta Handik' ,u'http://www.deia.com/index.php/services/rss?seccion=hemendik-eta-handik'),
+ (u'Margen Derecha' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-derecha'),
+ (u'Encartaciones y Margen Izquierda' ,u'http://www.deia.com/index.php/services/rss?seccion=margen-izquierda-encartaciones'),
+ (u'Costa' ,u'http://www.deia.com/index.php/services/rss?seccion=costa'),
+ (u'Duranguesado' ,u'http://www.deia.com/index.php/services/rss?seccion=duranguesado'),
+ (u'Llodio-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=llodio-nervion'),
+ (u'Arratia-Nervión' ,u'http://www.deia.com/index.php/services/rss?seccion=arratia-nervion'),
+ (u'Uribe-Txorierri' ,u'http://www.deia.com/index.php/services/rss?seccion=uribe-txorierri'),
+ (u'Ecos de sociedad' ,u'http://www.deia.com/index.php/services/rss?seccion=ecos-de-sociedad'),
+ (u'Sucesos' ,u'http://www.deia.com/index.php/services/rss?seccion=sucesos'),
+ (u'Política' ,u'http://www.deia.com/index.php/services/rss?seccion=politica'),
+ (u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/euskadi'),
+ (u'España' ,u'http://www.deia.com/index.php/services/rss?seccion=politica/espana'),
+ (u'Sociedad',u'http://www.deia.com/index.php/services/rss?seccion=sociedad'),
+ (u'Euskadi' ,u'http://www.deia.com/index.php/services/rss?seccion=socidad/euskadi'),
+ (u'Sociedad.España' ,u'http://www.deia.com/index.php/services/rss?seccion=sociedad/espana'),
+ (u'Ocio y Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio-y-cultura'),
+ #(u'Cultura' ,u'http://www.deia.com/index.php/services/rss?seccion=cultura'),
+ #(u'Ocio' ,u'http://www.deia.com/index.php/services/rss?seccion=ocio'),
+ (u'On' ,u'http://www.deia.com/index.php/services/rss?seccion=on'),
+ (u'Agenda' ,u'http://www.deia.com/index.php/services/rss?seccion=agenda'),
+ (u'Comunicación' ,u'http://www.deia.com/index.php/services/rss?seccion=comunicacion'),
+ (u'Viajes' ,u'http://www.deia.com/index.php/services/rss?seccion=viajes'),
+ (u'¡Mundo!' ,u'http://www.deia.com/index.php/services/rss?seccion=que-mundo'),
+ (u'Humor' ,u'http://www.deia.com/index.php/services/rss?seccion=humor'),
+ (u'Opinión' ,u'http://www.deia.com/index.php/services/rss?seccion=opinion'),
+ (u'Editorial' ,u'http://www.deia.com/index.php/services/rss?seccion=editorial'),
+ (u'Tribuna abierta' ,u'http://www.deia.com/index.php/services/rss?seccion=tribuna-abierta'),
+ (u'Colaboración' ,u'http://www.deia.com/index.php/services/rss?seccion=colaboracion'),
+ (u'Columnistas' ,u'http://www.deia.com/index.php/services/rss?seccion=columnistas'),
+ (u'Deportes' ,u'http://www.deia.com/index.php/services/rss?seccion=deportes'),
+ (u'Athletic' ,u'http://www.deia.com/index.php/services/rss?seccion=athletic'),
+ (u'Economía' ,'http://www.deia.com/index.php/services/rss?seccion=economia'),
+ (u'Mundo' ,u'http://www.deia.com/index.php/services/rss?seccion=mundo')]
+
diff --git a/resources/recipes/ibm_smarter_planet.recipe b/resources/recipes/ibm_smarter_planet.recipe
new file mode 100644
index 0000000000..2e5c46fb80
--- /dev/null
+++ b/resources/recipes/ibm_smarter_planet.recipe
@@ -0,0 +1,23 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1293122276(BasicNewsRecipe):
+ title = u'Smarter Planet | Tumblr for eReaders'
+ __author__ = 'Jack Mason'
+ author = 'IBM Global Business Services'
+ publisher = 'IBM'
+ category = 'news, technology, IT, internet of things, analytics'
+ oldest_article = 7
+ max_articles_per_feed = 30
+ no_stylesheets = True
+ use_embedded_content = False
+ masthead_url = 'http://30.media.tumblr.com/tumblr_l70dow9UmU1qzs4rbo1_r3_250.jpg'
+ remove_tags_before = dict(id='item')
+ remove_tags_after = dict(id='item')
+ remove_tags = [dict(attrs={'class':['sidebar', 'about', 'footer', 'description,' 'disqus', 'nav', 'notes', 'disqus_thread']}),
+ dict(id=['sidebar', 'footer', 'disqus', 'nav', 'notes', 'likes_container', 'description', 'disqus_thread', 'about']),
+ dict(name=['script', 'noscript', 'style'])]
+
+
+
+ feeds = [(u'Smarter Planet Tumblr', u'http://smarterplanet.tumblr.com/mobile/rss')]
+
diff --git a/resources/recipes/ledevoir.recipe b/resources/recipes/ledevoir.recipe
index 97b33c43a7..c54f21c7ec 100644
--- a/resources/recipes/ledevoir.recipe
+++ b/resources/recipes/ledevoir.recipe
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini '
@@ -14,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class ledevoir(BasicNewsRecipe):
author = 'Lorenzo Vigentini'
- description = 'Canadian Paper'
+ description = 'Canadian Paper. A subscription is optional, with it you get more content'
cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
title = u'Le Devoir'
@@ -28,6 +27,7 @@ class ledevoir(BasicNewsRecipe):
max_articles_per_feed = 50
use_embedded_content = False
recursion = 10
+ needs_subscription = 'optional'
remove_javascript = True
no_stylesheets = True
@@ -77,3 +77,12 @@ class ledevoir(BasicNewsRecipe):
.credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
.texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
'''
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser()
+ if self.username is not None and self.password is not None:
+ br.open('http://www.ledevoir.com')
+ br.select_form(nr=1)
+ br['login[courriel]'] = self.username
+ br['login[password]'] = self.password
+ br.submit()
+ return br
diff --git a/resources/recipes/new_london_day.recipe b/resources/recipes/new_london_day.recipe
new file mode 100644
index 0000000000..bc8c44e40e
--- /dev/null
+++ b/resources/recipes/new_london_day.recipe
@@ -0,0 +1,74 @@
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294342201(BasicNewsRecipe):
+ title = u'New London Day'
+ __author__ = 'Being'
+ description = 'State, local and business news from New London, CT'
+ language = 'en' # US paper (New London, CT), so plain 'en' rather than the British locale
+ oldest_article = 1
+ max_articles_per_feed = 200
+
+ use_embedded_content = False
+ no_stylesheets = True
+ remove_javascript = True
+ remove_tags_before = dict(id='article')
+ remove_tags_after = dict(id='article')
+ remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+ dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+ dict(name=['script', 'noscript', 'style'])]
+ remove_tags_after = [ {'class':['photo_article',]} ]
+ remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
+ {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
+ dict(name='font',attrs={'id':["cr-other-headlines"]})]
+ extra_css = '''
+ h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
+ .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+ '''
+
+ feeds = [
+ (u'All News', u'http://www.theday.com/section/rss'),
+ (u'Breaking News', u'http://www.theday.com/section/rss01'),
+ (u'Police and Courts', u'http://www.theday.com/section/rss02'),
+ (u'State News', u'http://www.theday.com/section/rss03'),
+ (u'Local Business', u'http://www.theday.com/section/rss04'),
+ (u'Entertainment', u'http://www.theday.com/section/rss05'),
+ (u'Opinion', u'http://www.theday.com/section/rss06'),
+ (u'Casinos', u'http://www.theday.com/section/rss12'),
+ (u'Defense and Military', u'http://www.theday.com/section/rss14'),
+ (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
+ (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
+ (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
+ (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]
+
+ def print_version(self, url):
+ return url.replace('/index.html', '/print.html')
+
+ def get_article_url(self, article):
+ return article.get('feedburner_origlink', article.get('guid', article.get('link')))
+
+
+ def postprocess_html(self, soup, first_fetch): # returns the same soup after renaming table tags and extracting two kinds of elements
+ for t in soup.findAll(['table', 'tr', 'td']): # turn table markup into plain divs
+ t.name = 'div'
+
+ for tag in soup.findAll('form', attrs={'name':["comments_form"]}): # attrs must be passed by keyword; a positional dict(attrs=...) never matches
+ tag.extract()
+ for tag in soup.findAll('font', attrs={'id':["cr-other-headlines"]}): # same keyword form as above
+ tag.extract()
+
+ return soup
+
diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe
new file mode 100644
index 0000000000..996aef2fdf
--- /dev/null
+++ b/resources/recipes/njp.recipe
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
+__version__ = 'v0.01'
+__date__ = '2011-01-05'
+'''
+njp.org
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NewJournalOfPhysics(BasicNewsRecipe):
+ title = u'New Journal of Physics'
+ __author__ = u'Chema Cort\xe9s'
+ description = u'The open-access journal for physics'
+ publisher = u'IOP (Institute of Physics)'
+ category = 'physics, journal, science'
+ language = 'en'
+
+ oldest_article = 30
+ max_articles_per_feed = 100
+
+ keep_only_tags = [dict(id=['fulltextContainer'])]
+ no_stylesheets=True
+ use_embedded_content=False
+
+ feeds = [(u'Latest Papers', u'http://iopscience.iop.org/1367-2630/?rss=1')]
+
+ def print_version(self, url):
+ return url+"/fulltext"
diff --git a/resources/recipes/sunday_times.recipe b/resources/recipes/sunday_times.recipe
new file mode 100644
index 0000000000..1f20f73cd9
--- /dev/null
+++ b/resources/recipes/sunday_times.recipe
@@ -0,0 +1,115 @@
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic '
+'''
+www.thesundaytimes.co.uk
+'''
+import urllib
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TimesOnline(BasicNewsRecipe):
+ title = 'The Sunday Times UK'
+ __author__ = 'Darko Miletic'
+ description = 'news from United Kingdom and World'
+ language = 'en_GB'
+ publisher = 'Times Newspapers Ltd'
+ category = 'news, politics, UK'
+ oldest_article = 3
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ encoding = 'utf-8'
+ delay = 1
+ needs_subscription = True
+ publication_type = 'newspaper'
+ masthead_url = 'http://www.thesundaytimes.co.uk/sto/public/images/logos/logo-home.gif'
+ INDEX = 'http://www.thesundaytimes.co.uk'
+ PREFIX = u'http://www.thesundaytimes.co.uk/sto/'
+ extra_css = """
+ .author-name,.authorName{font-style: italic}
+ .published-date,.multi-position-photo-text{font-family: Arial,Helvetica,sans-serif;
+ font-size: small; color: gray;
+ display:block; margin-bottom: 0.5em}
+ body{font-family: Georgia,"Times New Roman",Times,serif}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser()
+ br.open('http://www.timesplus.co.uk/tto/news/?login=false&url=http://www.thesundaytimes.co.uk/sto/')
+ if self.username is not None and self.password is not None:
+ data = urllib.urlencode({ 'userName':self.username
+ ,'password':self.password
+ ,'keepMeLoggedIn':'false'
+ })
+ br.open('https://www.timesplus.co.uk/iam/app/authenticate',data)
+ return br
+
+ remove_tags = [
+ dict(name=['object','link','iframe','base','meta'])
+ ,dict(attrs={'class':'tools comments-parent' })
+ ]
+ remove_attributes=['lang']
+ keep_only_tags = [
+ dict(attrs={'class':'standard-content'})
+ ,dict(attrs={'class':'f-author'})
+ ,dict(attrs={'id':'bodycopy'})
+ ]
+ remove_tags_after=dict(attrs={'class':'tools_border'})
+
+ feeds = [
+ (u'UK News' , PREFIX + u'news/uk_news/' )
+ ,(u'World' , PREFIX + u'news/world_news/' )
+ ,(u'Politics' , PREFIX + u'news/Politics/' )
+ ,(u'Focus' , PREFIX + u'news/focus/' )
+ ,(u'Insight' , PREFIX + u'news/insight/' )
+ ,(u'Ireland' , PREFIX + u'news/ireland/' )
+ ,(u'Columns' , PREFIX + u'comment/columns/' )
+ ,(u'Arts' , PREFIX + u'culture/arts/' )
+ ,(u'Books' , PREFIX + u'culture/books/' )
+ ,(u'Film and TV' , PREFIX + u'culture/film_and_tv/' )
+ ,(u'Sport' , PREFIX + u'sport/' )
+ ,(u'Business' , PREFIX + u'business' )
+ ,(u'Money' , PREFIX + u'business/money/' )
+ ,(u'Style' , PREFIX + u'style/' )
+ ,(u'Travel' , PREFIX + u'travel/' )
+ ,(u'Clarkson' , PREFIX + u'ingear/clarkson/' )
+ ,(u'Cars' , PREFIX + u'ingear/cars/' )
+ ,(u'Bikes' , PREFIX + u'ingear/2_Wheels/' )
+ ,(u'Tech' , PREFIX + u'ingear/Tech___Games/' )
+ ,(u'Magazine' , PREFIX + u'Magazine/' )
+ ]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return self.adeify_images(soup)
+
+ def parse_index(self):
+ totalfeeds = []
+ lfeeds = self.get_feeds()
+ for feedobj in lfeeds:
+ feedtitle, feedurl = feedobj
+ self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+ articles = []
+ soup = self.index_to_soup(feedurl)
+ for atag in soup.findAll('a',href=True):
+ parentName = atag.parent.name
+ title = self.tag_to_string(atag).strip()
+ if (parentName == 'h2' or parentName == 'h3') and title is not None and title != '':
+ url = self.INDEX + atag['href']
+ articles.append({
+ 'title' :title
+ ,'date' :''
+ ,'url' :url
+ ,'description':''
+ })
+ totalfeeds.append((feedtitle, articles))
+ return totalfeeds
diff --git a/resources/recipes/walla.recipe b/resources/recipes/walla.recipe
new file mode 100644
index 0000000000..5fbfed7a03
--- /dev/null
+++ b/resources/recipes/walla.recipe
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1283848012(BasicNewsRecipe):
+ description = 'The WallaNews.'
+ cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
+ title = u'Walla'
+ language = 'he'
+ __author__ = 'marbs'
+ extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
+ simultaneous_downloads = 5
+# remove_javascript = True
+ timefmt = '[%a, %d %b, %Y]'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ # remove_attributes = ['width']
+ keep_only_tags =dict(name='div', attrs={'class':'wp-0-b w3'})
+ remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
+ max_articles_per_feed = 100
+# preprocess_regexps = [
+# (re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: '')
+# ]
+
+
+ feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
+ (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
+ (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
+ (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
+ (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
+ (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
+ (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
+ (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
+ (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
+ (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
+ (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
+ (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
+ (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
+ (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]
+
+ def print_version(self, url):
+ print_url = url + '/@@/item/printer'
+ return print_url
+
diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js
index d36e7c579a..253524326f 100644
--- a/resources/viewer/bookmarks.js
+++ b/resources/viewer/bookmarks.js
@@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
$.scrollTo($(bm[0]), 1000,
{
over:ratio,
+ axis: 'y', // Do not scroll in the x direction
onAfter:function(){window.py_bridge.animated_scroll_done()}
}
);
diff --git a/setup/build_environment.py b/setup/build_environment.py
index d6581a907d..10ab1b0735 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -121,7 +121,7 @@ if iswindows:
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
- magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.5.6')]
+ magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.6.6')]
magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')]
magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_']
podofo_inc = os.path.join(sw_inc_dir, 'podofo')
diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py
index 7d8ea4d80a..e9e47816fd 100644
--- a/setup/installer/windows/freeze.py
+++ b/setup/installer/windows/freeze.py
@@ -18,7 +18,7 @@ QT_DLLS = ['Core', 'Gui', 'Network', 'Svg', 'WebKit', 'Xml', 'XmlPatterns']
LIBUSB_DIR = 'C:\\libusb'
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
SW = r'C:\cygwin\home\kovid\sw'
-IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.5.6',
+IMAGEMAGICK = os.path.join(SW, 'build', 'ImageMagick-6.6.6',
'VisualMagick', 'bin')
VERSION = re.sub('[a-z]\d+', '', __version__)
diff --git a/setup/installer/windows/notes.rst b/setup/installer/windows/notes.rst
index b9aef39657..5dfd956ce2 100644
--- a/setup/installer/windows/notes.rst
+++ b/setup/installer/windows/notes.rst
@@ -301,12 +301,14 @@ int projectType = MULTITHREADEDDLL;
Run configure.bat in a visual studio command prompt
+Run configure.exe generated by configure.bat
+
Edit magick/magick-config.h
Undefine ProvideDllMain and MAGICKCORE_X11_DELEGATE
Now open VisualMagick/VisualDynamicMT.sln set to Release
-Remove the CORE_xlib project
+Remove the CORE_xlib, UTIL_Imdisplay and CORE_Magick++ projects
calibre
---------
diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py
index 770d405203..13e1f20a2d 100644
--- a/src/calibre/customize/__init__.py
+++ b/src/calibre/customize/__init__.py
@@ -80,6 +80,100 @@ class Plugin(object): # {{{
'''
pass
+ def config_widget(self):
+ '''
+ Implement this method and :meth:`save_settings` in your plugin to
+ use a custom configuration dialog, rather than relying on the simple
+ string-based default customization.
+
+ This method, if implemented, must return a QWidget. The widget can have
+ an optional method validate() that takes no arguments and is called
+ immediately after the user clicks OK. Changes are applied if and only
+ if the method returns True.
+ '''
+ raise NotImplementedError()
+
+ def save_settings(self, config_widget):
+ '''
+ Save the settings specified by the user with config_widget.
+
+ :param config_widget: The widget returned by :meth:`config_widget`.
+
+ '''
+ raise NotImplementedError()
+
+ def do_user_config(self, parent=None):
+ '''
+ This method shows a configuration dialog for this plugin. It returns
+ True if the user clicks OK, False otherwise. The changes are
+ automatically applied.
+ '''
+ from PyQt4.Qt import QDialog, QDialogButtonBox, QVBoxLayout, \
+ QLabel, Qt, QLineEdit
+ from calibre.gui2 import gprefs
+
+ prefname = 'plugin config dialog:'+self.type + ':' + self.name
+ geom = gprefs.get(prefname, None)
+
+ config_dialog = QDialog(parent)
+ button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+ v = QVBoxLayout(config_dialog)
+
+ def size_dialog():
+ if geom is None:
+ config_dialog.resize(config_dialog.sizeHint())
+ else:
+ config_dialog.restoreGeometry(geom)
+
+ button_box.accepted.connect(config_dialog.accept)
+ button_box.rejected.connect(config_dialog.reject)
+ config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
+ try:
+ config_widget = self.config_widget()
+ except NotImplementedError:
+ config_widget = None
+
+ if config_widget is not None:
+ v.addWidget(config_widget)
+ v.addWidget(button_box)
+ size_dialog()
+ config_dialog.exec_()
+
+ if config_dialog.result() == QDialog.Accepted:
+ if hasattr(config_widget, 'validate'):
+ if config_widget.validate():
+ self.save_settings(config_widget)
+ else:
+ self.save_settings(config_widget)
+ else:
+ from calibre.customize.ui import plugin_customization, \
+ customize_plugin
+ help_text = self.customization_help(gui=True)
+ help_text = QLabel(help_text, config_dialog)
+ help_text.setWordWrap(True)
+ help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse
+ | Qt.LinksAccessibleByKeyboard)
+ help_text.setOpenExternalLinks(True)
+ v.addWidget(help_text)
+ sc = plugin_customization(self)
+ if not sc:
+ sc = ''
+ sc = sc.strip()
+ sc = QLineEdit(sc, config_dialog)
+ v.addWidget(sc)
+ v.addWidget(button_box)
+ size_dialog()
+ config_dialog.exec_()
+
+ if config_dialog.result() == QDialog.Accepted:
+ sc = unicode(sc.text()).strip()
+ customize_plugin(self, sc)
+
+ geom = bytearray(config_dialog.saveGeometry())
+ gprefs[prefname] = geom
+
+ return config_dialog.result()
+
def load_resources(self, names):
'''
If this plugin comes in a ZIP file (user added plugin), this method
diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py
index 51ceb94a99..c5e8f5ca0f 100644
--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@@ -33,7 +33,7 @@ class KOBO(USBMS):
booklist_class = CollectionsBookList
# Ordered list of supported formats
- FORMATS = ['epub', 'pdf']
+ FORMATS = ['epub', 'pdf', 'txt', 'cbz', 'cbr']
CAN_SET_METADATA = ['collections']
VENDOR_ID = [0x2237]
@@ -409,7 +409,7 @@ class KOBO(USBMS):
else:
ContentType = 901
else: # if extension == '.html' or extension == '.txt':
- ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored
+ ContentType = 901 # Yet another hack: to get around Kobo changing how ContentID is stored
return ContentType
def path_from_contentid(self, ContentID, ContentType, MimeType, oncard):
diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py
index 2a0fdf6433..ecd12ac61d 100644
--- a/src/calibre/devices/misc.py
+++ b/src/calibre/devices/misc.py
@@ -259,7 +259,7 @@ class EEEREADER(USBMS):
PRODUCT_ID = [0x178f]
BCD = [0x0319]
- EBOOK_DIR_MAIN = 'Books'
+ EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Book'
VENDOR_NAME = 'LINUX'
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'FILE-STOR_GADGET'
diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index 6652d581d4..98a7241a36 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -61,14 +61,26 @@ class PRS505(USBMS):
ALL_BY_TITLE = _('All by title')
ALL_BY_AUTHOR = _('All by author')
- EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of metadata fields '
+ EXTRA_CUSTOMIZATION_MESSAGE = [
+ _('Comma separated list of metadata fields '
'to turn into collections on the device. Possibilities include: ')+\
'series, tags, authors' +\
_('. Two special collections are available: %s:%s and %s:%s. Add '
'these values to the list to enable them. The collections will be '
'given the name provided after the ":" character.')%(
- 'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR)
- EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['series', 'tags'])
+ 'abt', ALL_BY_TITLE, 'aba', ALL_BY_AUTHOR),
+ _('Upload separate cover thumbnails for books (newer readers)') +
+ ':::'+_('Normally, the SONY readers get the cover image from the'
+ ' ebook file itself. With this option, calibre will send a '
+ 'separate cover image to the reader, useful if you are '
+ 'sending DRMed books in which you cannot change the cover.'
+ ' WARNING: This option should only be used with newer '
+ 'SONY readers: 350, 650, 950 and newer.'),
+ ]
+ EXTRA_CUSTOMIZATION_DEFAULT = [
+ ', '.join(['series', 'tags']),
+ False
+ ]
plugboard = None
plugboard_func = None
@@ -159,7 +171,7 @@ class PRS505(USBMS):
opts = self.settings()
if opts.extra_customization:
collections = [x.strip() for x in
- opts.extra_customization.split(',')]
+ opts.extra_customization[0].split(',')]
else:
collections = []
debug_print('PRS505: collection fields:', collections)
@@ -186,8 +198,12 @@ class PRS505(USBMS):
self.plugboard_func = pb_func
def upload_cover(self, path, filename, metadata, filepath):
- return # Disabled as the SONY's don't need this thumbnail anyway and
- # older models don't auto delete it
+ opts = self.settings()
+ if not opts.extra_customization[1]:
+ # Building thumbnails disabled
+ debug_print('PRS505: not uploading covers')
+ return
+ debug_print('PRS505: uploading covers')
if metadata.thumbnail and metadata.thumbnail[-1]:
path = path.replace('/', os.sep)
is_main = path.startswith(self._main_prefix)
diff --git a/src/calibre/devices/usbms/deviceconfig.py b/src/calibre/devices/usbms/deviceconfig.py
index e074387175..940ea96f38 100644
--- a/src/calibre/devices/usbms/deviceconfig.py
+++ b/src/calibre/devices/usbms/deviceconfig.py
@@ -10,7 +10,21 @@ from calibre.utils.config import Config, ConfigProxy
class DeviceConfig(object):
HELP_MESSAGE = _('Configure Device')
+
+ #: Can be None, a string or a list of strings. When it is a string
+ #: that string is used for the help text and the actual customization value
+ #: can be read from ``dev.settings().extra_customization``.
+ #: If it is a list of strings, then dev.settings().extra_customization will
+ #: also be a list. In this case, you *must* ensure that
+ #: EXTRA_CUSTOMIZATION_DEFAULT is also a list. The list can contain either
+ #: boolean values or strings, in which case a checkbox or line edit will be
+ #: used for them in the config widget, automatically.
+ #: If a string contains ::: then the text after it is interpreted as the
+ #: tooltip
EXTRA_CUSTOMIZATION_MESSAGE = None
+
+ #: The default value for extra customization. If you set
+ #: EXTRA_CUSTOMIZATION_MESSAGE you *must* set this as well.
EXTRA_CUSTOMIZATION_DEFAULT = None
SUPPORTS_SUB_DIRS = False
@@ -73,16 +87,33 @@ class DeviceConfig(object):
if cls.SUPPORTS_USE_AUTHOR_SORT:
proxy['use_author_sort'] = config_widget.use_author_sort()
if cls.EXTRA_CUSTOMIZATION_MESSAGE:
- ec = unicode(config_widget.opt_extra_customization.text()).strip()
- if not ec:
- ec = None
+ if isinstance(cls.EXTRA_CUSTOMIZATION_MESSAGE, list):
+ ec = []
+ for i in range(0, len(cls.EXTRA_CUSTOMIZATION_MESSAGE)):
+ if hasattr(config_widget.opt_extra_customization[i], 'isChecked'):
+ ec.append(config_widget.opt_extra_customization[i].isChecked())
+ else:
+ ec.append(unicode(config_widget.opt_extra_customization[i].text()).strip())
+ else:
+ ec = unicode(config_widget.opt_extra_customization.text()).strip()
+ if not ec:
+ ec = None
proxy['extra_customization'] = ec
st = unicode(config_widget.opt_save_template.text())
proxy['save_template'] = st
@classmethod
def settings(cls):
- return cls._config().parse()
+ opts = cls._config().parse()
+ if isinstance(cls.EXTRA_CUSTOMIZATION_DEFAULT, list):
+ if opts.extra_customization is None:
+ opts.extra_customization = []
+ if not isinstance(opts.extra_customization, list):
+ opts.extra_customization = [opts.extra_customization]
+ for i,d in enumerate(cls.EXTRA_CUSTOMIZATION_DEFAULT):
+ if i >= len(opts.extra_customization):
+ opts.extra_customization.append(d)
+ return opts
@classmethod
def save_template(cls):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3ff816b3bf..29006ffd9b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -51,16 +51,16 @@ def chap_head(match):
chap = match.group('chap')
title = match.group('title')
if not title:
- return ''+chap+'
\n'
+ return ''+chap+'
\n'
else:
- return ''+chap+'
\n'+title+'
\n'
+ return ''+chap+'
\n'+title+'
\n'
def wrap_lines(match):
ital = match.group('ital')
if not ital:
- return ' '
+ return ' '
else:
- return ital+' '
+ return ital+' '
class DocAnalysis(object):
'''
@@ -191,7 +191,7 @@ class Dehyphenator(object):
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
- lookupword = self.removeprefix.sub('', lookupword)
+ lookupword = self.removeprefix.sub('', lookupword)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
@@ -353,7 +353,7 @@ class HTMLPreProcessor(object):
(re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
# Center separator lines
- (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '\n
' + match.group(1) + '
'),
+ (re.compile(u'
\s*(?P([*#•✦]+\s*)+)\s*
'), lambda match: '\n
' + match.group(1) + '
'),
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
@@ -363,13 +363,11 @@ class HTMLPreProcessor(object):
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
- # Detect Chapters to match default XPATH in GUI
- (re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*([ibu]>){0,2})\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*
)?', re.IGNORECASE), chap_head),
- # Cover the case where every letter in a chapter title is separated by a space
- (re.compile(r'
\s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*(
))?'), chap_head),
+ # Convert line breaks to paragraphs
+ (re.compile(r'
]*>\s*'), lambda match : '
\n'),
+ (re.compile(r'
]*>\s*'), lambda match : '\n'),
+ (re.compile(r'\s*'), lambda match : '
\n'),
- # Have paragraphs show better
- (re.compile(r''), lambda match : ''),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
@@ -455,9 +453,9 @@ class HTMLPreProcessor(object):
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
- end_rules.append((re.compile(u'[](\s*
)+\s*(?=[[a-z\d])'), lambda match: ''))
+ end_rules.append((re.compile(u'[](
\s*\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
- end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+ end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(
\s*\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
@@ -475,7 +473,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*
\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?(i|b|u)>)?\s*(\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -508,7 +506,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(html,'pdf', length)
+ html = dehyphenator(html,'html', length)
+
+ if is_pdftohtml:
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pdf_markup = PreProcessor(self.extra_opts, None)
+ totalwords = 0
+ totalwords = pdf_markup.get_word_count(html)
+ if totalwords > 7000:
+ html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
@@ -554,5 +560,9 @@ class HTMLPreProcessor(object):
html = smartyPants(html)
html = html.replace(start, '')
+ # convert ellipsis to entities to prevent wrapping
+ html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
+ # convert double dashes to em-dash
+ html = re.sub('\s--\s', u'\u2014', html)
return substitute_entites(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..1bb232c911 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
import re
+from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object):
@@ -17,6 +19,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts
+ def is_pdftohtml(self, src): # True when the first 1000 characters of src carry the pdftohtml marker comment
+ return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] # NOTE(review): marker text restored after extraction emptied the literal ('' in s is always True) - confirm against upstream
+
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -64,7 +69,7 @@ class PreProcessor(object):
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
- htm_end_ere = re.compile('
', re.DOTALL)
+ htm_end_ere = re.compile('(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
@@ -101,36 +106,125 @@ class PreProcessor(object):
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
+ def get_word_count(self, html):
+ word_count_text = re.sub(r'(?s)]*>.*?', '', html)
+ word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+ wordcount = get_wordcount_obj(word_count_text)
+ return wordcount.words
+
+ def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+ # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+ # minimum of chapters to search for
+ self.min_chapters = 1
+ if wordcount > 7000:
+ self.min_chapters = int(ceil(wordcount / 7000.))
+ #print "minimum chapters required are: "+str(self.min_chapters)
+ heading = re.compile(']*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
+ # Build the Regular Expressions in pieces
+ init_lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ chapter_header_open = r"(?P"
+ title_header_open = r"(?P"
+ chapter_header_close = ")\s*"
+ title_header_close = ")"
+ chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
+ title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)>)?\s*(?P=outer2)>"
+
+ is_pdftohtml = self.is_pdftohtml(html)
+ if is_pdftohtml:
+ chapter_line_open = "<(?Pp)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+ chapter_line_close = "\s*([ibu][^>]*>\s*)?(?P=outer)>"
+ title_line_open = "<(?Pp)[^>]*>\s*"
+ title_line_close = "\s*(?P=outer2)>"
+
+
+ if blanks_between_paragraphs:
+ blank_lines = "(\s*]*>\s*
){0,2}\s*"
+ else:
+ blank_lines = ""
+ opt_title_open = "("
+ opt_title_close = ")?"
+ n_lookahead_open = "\s+(?!"
+ n_lookahead_close = ")"
+
+ default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
+
+ chapter_types = [
+ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+ [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines
+ [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+ [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
+ [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+ [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
+ [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+ ]
+
+ # Start with most typical chapter headings, get more aggressive until one works
+ for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+ if self.html_preprocess_sections >= self.min_chapters:
+ break
+ full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+ self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+ if lookahead_ignorecase:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ else:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+ html = chapdetect.sub(self.chapter_head, html)
+
+ words_per_chptr = wordcount
+ if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+ words_per_chptr = wordcount / self.html_preprocess_sections
+ self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+ return html
+
+
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
+ # Count the words in the document to estimate how many chapters to look for and whether
+ # other types of processing are attempted
+ totalwords = 0
+ totalwords = self.get_word_count(html)
+
+ if totalwords < 20:
+ self.log("not enough text, not preprocessing")
+ return html
+
# Arrange line feeds and tags so the line_length and no_markup functions work correctly
- html = re.sub(r"\s*", "\n", html)
- html = re.sub(r"\s*[^>]*)>\s*", "\n
"+">", html)
+ html = re.sub(r"\s*(?Pp|div)>", ""+"\g"+">\n", html)
+ html = re.sub(r"\s*<(?Pp|div)(?P\n"
+"Default pattern
\n"
+"\\[.+\\]
\n"
+"excludes tags of the form "
+"[tag],
\n"
+"e.g., [Project "
+"Gutenberg]
"
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:306
msgid "Excluded genres"
-msgstr ""
+msgstr "Xéneros excluídos"
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:307
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:310
msgid "Tags to &exclude"
-msgstr ""
+msgstr "Etiquetas a &Excluír"
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:308
msgid ""
"Books matching either pattern will not be included in generated catalog. "
msgstr ""
+"Os libros que coincidan con calquera filtro non se incluirán no catálogo "
+"xeral. "
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:309
msgid "Excluded books"
-msgstr ""
+msgstr "Libros excluídos"
#: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi_ui.py:311
msgid ""
@@ -5654,99 +5683,116 @@ msgid ""
"size:12pt;\">Default: ~,Catalog