diff --git a/Changelog.yaml b/Changelog.yaml
index b50ae0e53c..b055de5142 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,273 @@
 # new recipes:
 #  - title:
+- version: 0.8.50
+  date: 2012-05-04
+
+  new features:
+    - title: "Tweak Book: Allow tweaking of KF8 MOBI files. Useful to fine-tune the result of a conversion. Right click on the book and select Tweak Book to use the feature. Note that tweaking a MOBI file that contains both KF8 and older MOBI6 will cause the MOBI6 version to be discarded."
+
+    - title: "AZW3 output plugin. This output plugin generates pure KF8 mobi files. These only work on the Kindle Fire and Kindle Touch with latest firmware."
+
+    - title: "Conversion: Allow easy re-ordering of the search and replace expressions in the conversion dialog. Also apply the expressions in the same order that they were entered when doing the conversion."
+
+    - title: "Automatically add the tag 'Sample Book' when an Amazon sample is added to calibre"
+
+    - title: "FB2 Input: Better handling of inline images."
+      tickets: [989869]
+
+  bug fixes:
+    - title: "KF8 Output: Fix section to section jumps not working for documents with multi-level ToCs"
+
+    - title: "EPUB Input: Handle the case of the metadata ToC containing a reference to the cover HTML file."
+      tickets: [993812]
+
+    - title: "CHM Input: Handle files with deeply nested markup and non-HTML files listed at the start of the manifest."
+      tickets: [993607]
+
+    - title: "KF8 Output: Workaround Kindle Touch bug that causes the book to be rendered as black pages when a height is specified for <body>"
" + + - title: "Fix regression in 0.8.49 that broke italics detection in heuristic processing on 32-bit systems." + tickets: [991380] + + - title: "KF8 Output: Fix joint MOBI6/KF8 books not being recognized as MOBI files by older Kindles" + + - title: "KF8 Output: Fix errors when processing documents with HTML comments and/or XML processing instructions" + + - title: "Get Books: Amazon fix prices not being found. B&N fix details link. ebooks.com: fix cover image. Website changes to various EU stores" + + - title: "FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded." + tickets: [990929] + + - title: "Fix scrolling with the cover browser updating only the selection in the book list, not the current book." + tickets: [990881] + + - title: "Save to Disk: Do not run out memory when saving very large files on systems with low RAM." + tickets: [990741] + + - title: "FB2 Output: Use 2 letter language codes in preference to 3-letter ones to not break poorly implemented FB2 readers" + tickets: [990026] + + - title: "EPUB Input: Auto set the media-type for OPF manifest entries with an empty media-type" + + improved recipes: + - National Post + - Daily Mirror + - Sun + - Newsweek Polska + - Max-Planck + - derStandard + - tweakers.net + + new recipes: + - title: George Monbiot + author: Darko Miletic + + - title: El Mundo + author: atordo + + - title: AraInfo and Diagonal + author: Ruben Pollan + + +- version: 0.8.49 + date: 2012-04-27 + + new features: + - title: "Experimental support for generating Amazon's new KF8 format MOBI files" + description: "calibre can now generate Amazon's new KF8 format MOBI files. + To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add: + test_mobi_output_type = 'both' + calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them. + To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511 + Note that calibre support for KF8 is still experimental and there will likely be bugs." + + - title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness." + + - title: "Show cover size in a tooltip in the conversion dialog" + tickets: [986958] + + - title: "Driver for Nook Simple Touch with Glow Light" + tickets: [989264] + + bug fixes: + - title: "Heuristics: When italicizing words do not operate on words not in between HTML tags." + tickets: [986298] + + - title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates." + tickets: [986658] + + - title: "Fix tooltip not being updated in the book details panel when pasting in a new cover" + tickets: [986958] + + - title: "Cover Browser: Wrap the title on space only, not in between words." + tickets: [986516] + + - title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book." + tickets: [986903] + + - title: "Fix heuristics not removing unnecessary hyphens from the end of lines." + tickets: [822744] + + improved recipes: + - Metro Nieuws NL + - Der Tagesspiegel + + new recipes: + - title: Berria + author: Alayn Gortazar + + - title: Sol Haber + author: Onur Gungor + + - title: Telam + author: Darko Miletic + + - title: Richmond Times-Dispatch + author: jde + +- version: 0.8.48 + date: 2012-04-20 + + new features: + - title: "Conversion: The search and replace feature has been completely revamped." 
+      description: "You can now use any number of search and replace
+      expressions, not just three. You can also store and load frequently used
+      sets of search and replace expressions. Also, the wizard generates its
+      preview in a separate process to protect against crashes/memory leaks."
+      tickets: [983476,983484,983478]
+
+    - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
+
+    - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
+      tickets: [981185]
+
+  bug fixes:
+    - title: "Get Books: Support the new website design of Barnes & Noble"
+
+    - title: "T1 driver: Fix books sent to SD card sometimes resulting in problems when deleted."
+      tickets: [943586]
+
+    - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
+
+    - title: "MOBI Output: Handle background color specified on

Komentarze | ", re.IGNORECASE), lambda m: '')]
     remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
         skip_tag = skip_tag.findAll(name='a')
         for r in skip_tag:
             if r.strong:
-                word=r.strong.string
-                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
-                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
+                word=r.strong.string.lower()
+                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+
+    def preprocess_html(self, soup):
+        footer=soup.find(attrs={'class':'news-footer middle-border'})
+        if footer and len(footer('a'))>=2:
+            footer('a')[1].extract()
+        for item in soup.findAll(style=True):
+            del item['style']
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                a['href']=self.index + a['href']
+        return soup
+
+
\ No newline at end of file
diff --git a/recipes/ara_info.recipe b/recipes/ara_info.recipe
new file mode 100644
index 0000000000..0345b9ed6e
--- /dev/null
+++ b/recipes/ara_info.recipe
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = 'Ruben Pollan'

# 2K, it is used as the article.
+
+
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuples ('feed title', list of articles)
# {
@@ -182,12 +187,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
+ # It is called by download().
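+ # Illustrative only (not from the original recipe): a minimal return value
+ # shaped as described above might be:
+ #   [('Feed title', [{'title': 'An article', 'url': 'http://example.com/a',
+ #                     'date': 'Mon, 30 Apr', 'description': '', 'content': ''}])]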
def parse_index(self):
# Parse the page into Python Soup
+ print("Entering recipe print_index from:")
+ traceback.print_stack()
+ print("web")
baseURL = "https://www.billoreilly.com"
- return self.parseGeneric(baseURL)
+ masterList = self.parseGeneric(baseURL)
+ #print(masterList)
+ return masterList
def preprocess_html(self, soup):
+ print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +207,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
+ def build_index(self):
+ print("In OReilly build_index()\n\n")
+ feedsRSS = []
+ self.report_progress(0, ('Fetching feeds...'))
+ #try:
+ feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+ max_articles_per_feed=self.max_articles_per_feed,
+ log=self.log)
+ self.report_progress(0, ('Got feeds from index page'))
+ #except NotImplementedError:
+ # feeds = self.parse_feeds()
+ # Now add regular feeds.
+ feedsRSS = self.parse_feeds()
+ print ("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+ for articles in feedsRSS:
+ print("articles is type "+articles.__class__.__name__)
+ print("Title:" + articles.title)
+ feeds.append(articles)
+ if not feeds:
+ raise ValueError('No articles found, aborting')
+
+ #feeds = FeedCollection(feeds)
+
+ self.report_progress(0, ('Trying to download cover...'))
+ self.download_cover()
+ self.report_progress(0, ('Generating masthead...'))
+ self.masthead_path = None
+
+ try:
+ murl = self.get_masthead_url()
+ except:
+ self.log.exception('Failed to get masthead url')
+ murl = None
+
+ if murl is not None:
+ # Try downloading the user-supplied masthead_url
+ # Failure sets self.masthead_path to None
+ self.download_masthead(murl)
+ if self.masthead_path is None:
+ self.log.info("Synthesizing mastheadImage")
+ self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+ try:
+ self.default_masthead_image(self.masthead_path)
+ except:
+ self.log.exception('Failed to generate default masthead image')
+ self.masthead_path = None
+
+ if self.test:
+ feeds = feeds[:2]
+ self.has_single_feed = len(feeds) == 1
+
+ index = os.path.join(self.output_dir, 'index.html')
+
+ html = self.feeds2index(feeds)
+ with open(index, 'wb') as fi:
+ fi.write(html)
+
+ self.jobs = []
+
+ if self.reverse_article_order:
+ for feed in feeds:
+ if hasattr(feed, 'reverse'):
+ feed.reverse()
+
+ self.feed_objects = feeds
+ for f, feed in enumerate(feeds):
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ if not os.path.isdir(feed_dir):
+ os.makedirs(feed_dir)
+
+ for a, article in enumerate(feed):
+ if a >= self.max_articles_per_feed:
+ break
+ art_dir = os.path.join(feed_dir, 'article_%d'%a)
+ if not os.path.isdir(art_dir):
+ os.makedirs(art_dir)
+ try:
+ url = self.print_version(article.url)
+ except NotImplementedError:
+ url = article.url
+ except:
+ self.log.exception('Failed to find print version for: '+article.url)
+ url = None
+ if not url:
+ continue
+ func, arg = (self.fetch_embedded_article, article) \
+ if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+ else \
+ ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+ else self.fetch_article), url)
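+ # In other words: feeds with embedded content are rendered directly from
+ # the feed itself; obfuscated articles go through fetch_obfuscated_article;
+ # everything else is fetched from its URL.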
+ req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+ {}, (f, a), self.article_downloaded,
+ self.error_in_article_download)
+ req.feed = feed
+ req.article = article
+ req.feed_dir = feed_dir
+ self.jobs.append(req)
+
+
+ self.jobs_done = 0
+ tp = ThreadPool(self.simultaneous_downloads)
+ for req in self.jobs:
+ tp.putRequest(req, block=True, timeout=0)
+
+
+ self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+ while True:
+ try:
+ tp.poll()
+ time.sleep(0.1)
+ except NoResultsPending:
+ break
+ for f, feed in enumerate(feeds):
+ print("Writing feeds for "+feed.title)
+ html = self.feed2index(f,feeds)
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+ fi.write(html)
+ self.create_opf(feeds)
+ self.report_progress(1, ('Feeds downloaded to %s')%index)
+
+ return index
+
+
diff --git a/recipes/orlando_sentinel.recipe b/recipes/orlando_sentinel.recipe
index 7a59f6f6ba..b327bc2b74 100644
--- a/recipes/orlando_sentinel.recipe
+++ b/recipes/orlando_sentinel.recipe
@@ -1,3 +1,4 @@
+import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
- keep_only_tags = [
- dict(name='div', attrs={'class':'story'})
- ]
- remove_tags = [
- dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
- ]
- remove_tags_after = [
- dict(name='p', attrs={'class':'copyright'}),
- ]
+
+ auto_cleanup = True
+
+ def get_article_url(self, article):
+ ans = None
+ try:
+ s = article.summary
+ ans = urllib.unquote(
+ re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+ except:
+ pass
+ if ans is None:
+ link = article.get('feedburner_origlink', None)
+ if link and link.split('/')[-1]=="story01.htm":
+ link=link.split('/')[-2]
+ encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+ '0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
+ '0S':'//'}
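+ # Hypothetical example (not a real feed value): the encoded fragment
+ # '0L0Swww0Borlandosentinel0N0Cnews' decodes via these replacements to
+ # 'http://www.orlandosentinel.com/news'.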
+ for k, v in encoding.iteritems():
+ link = link.replace(k, v)
+ ans = link
+ elif link:
+ ans = link
+ if ans is not None:
+ return ans.replace('?track=rss', '')
+
+
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index 952db30c3e..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
+ index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
@@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
- return url
\ No newline at end of file
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
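+ # e.g. a relative href such as '/artykuly/123.html' (hypothetical)
+ # becomes 'http://pcarena.pl/artykuly/123.html'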
+ return soup
\ No newline at end of file
diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe
index 38f7ec1a9a..92c9aaf9d6 100644
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@@ -1,5 +1,5 @@
"""
-readitlaterlist.com
+Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class Readitlater(BasicNewsRecipe):
- title = 'ReadItLater'
+class Pocket(BasicNewsRecipe):
+ title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
- description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
- up your news. This version displays pages of articles from \
+ description = '''Personalized news feeds. Go to getpocket.com to set up \
+ your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
- publisher = 'readitlaterlist.com'
+ publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
- minimum_articles = 1
+ minimum_articles = 10
+ mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
- INDEX = u'http://readitlaterlist.com'
+ INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
+ print 'Marking read: ', url
response = br.open(url)
- response
+ print response.info()
def cleanup(self):
- self.mark_as_read(self.readList)
+ if self.mark_as_read_after_dl:
+ self.mark_as_read(self.readList)
+ else:
+ pass
+ def default_cover(self, cover_file):
+ '''
+ Create a generic cover for recipes that don't have a cover
+ This override adds time to the cover
+ '''
+ try:
+ from calibre.ebooks import calibre_cover
+ title = self.title if isinstance(self.title, unicode) else \
+ self.title.decode('utf-8', 'replace')
+ date = strftime(self.timefmt)
+ time = strftime('[%I:%M %p]')
+ img_data = calibre_cover(title, date, time)
+ cover_file.write(img_data)
+ cover_file.flush()
+ except:
+ self.log.exception('Failed to generate default cover')
+ return False
+ return True
diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe
index 19add74fcd..cbf5a2f8e4 100644
--- a/recipes/real_clear.recipe
+++ b/recipes/real_clear.recipe
@@ -1,5 +1,7 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import re
import time
+from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
- debugMessages = False
+ debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
- ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
- ["Science", "http://www.realclearscience.com/index.xml", 0],
+ ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
+ ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+ ["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
- printhints = [
+ phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
+
+ printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
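+ # For example (illustrative), reading the first row above with the named
+ # indices: phUrlSnip='realclear', phLinkText='', phMainSearch='',
+ # phHrefSearch='printpage'.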
+ # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+ # The print link isn't obvious, and only the end is needed (the -full append). So maybe try that first?
+ # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+ # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+ # Use the FULL PRINTPAGE URL; it formats it better too!
+ #
+ # NYT - try single page...
+ # Need special code - is it one page or several? Which URL?
+ # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+ # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+ # which is at link rel="canonical" and at
-        if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+ if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
+ # e.g. RealClear
if self.debugMessages == True :
- print("search1")
+ print("Search by href: "+self.printhints[x][self.phHrefSearch])
+ printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+ elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+ if self.debugMessages == True :
+ print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+ print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
+ if self.debugMessages == True:
+ print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
+ # print(soup)
+ print("end soup\n\n");
continue
+
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
+ print("Check "+printFind['href']+" for base of "+baseURL)
+ if printFind['href'].find("http")!=0 :
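+ # e.g. a relative href like '/printpage/?url=...' (illustrative) is
+ # resolved against baseURL before being returned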
+ return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
+ #articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans
+
diff --git a/recipes/rebelion.recipe b/recipes/rebelion.recipe
new file mode 100644
index 0000000000..a01acc6204
--- /dev/null
+++ b/recipes/rebelion.recipe
@@ -0,0 +1,34 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class RebelionRecipe (BasicNewsRecipe):
+    __author__ = u'Marc Busqué'

'+title+'')
+            f.write(str(post.findNextSibling('p')))
+            f.write(u'</body></html>')
+            self.log('\tWrote note to', f.name)
+            f.close()
+            self.tempfiles.append(f)
+            articles.append({'title':title, 'url':url, 'date':self.pubdate})
+
+        return articles
+
+    def postprocess_html(self, soup, first):
+        for table in soup.findAll('table', align='right'):
+            img = table.find('img')
+            if img is not None:
+                img.extract()
+                caption = self.tag_to_string(table).strip()
+                div = Tag(soup, 'div')
+                div['style'] = 'text-align:center'
+                div.insert(0, img)
+                div.insert(1, Tag(soup, 'br'))
+                if caption:
+                    div.insert(2, NavigableString(caption))
+                table.replaceWith(div)
+
+        return soup
+
+    def image_url_processor(self, baseurl, url):
+        return url.replace(' ','%20')
+
+    def cleanup(self):
+        self.log('cleaning up')
+        for f in self.tempfiles:
+            os.unlink(f.name)
+        self.tempfiles = []
diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe
new file mode 100644
index 0000000000..68740fa4dd
--- /dev/null
+++ b/recipes/swiat_obrazu.recipe
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Swiat_Obrazu(BasicNewsRecipe):
+    title = u'Swiat Obrazu'
+    __author__ = 'fenuks'
+    description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
+    category = 'photography'
+    masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+    cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+    language = 'pl'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+    feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
+
+    def print_version(self, url):
+        return url + ',drukuj'
+
+    def image_url_processor(self, baseurl, url):
+        if 'http://' not in url and 'https://' not in url:
+            return 'http://www.swiatobrazu.pl' + url[5:]
+        else:
+            return url
diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe
index 92d88d56ae..71191065f1 100644
--- a/recipes/tagesspiegel.recipe
+++ b/recipes/tagesspiegel.recipe
@@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
     no_javascript = True
     remove_empty_feeds = True
     encoding = 'utf-8'
-    remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
+    remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
 
     def print_version(self, url):
         url = url.split('/')
@@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
             return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
 
         articles = {}
+        links = set()
         key = None
         ans = []
         maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
@@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
             if div['class'] == 'hcf-header':
                 try:
-                    key = string.capwords(feed_title(div.em.a))
+                    key = string.capwords(feed_title(div.em))
                     articles[key] = []
                     ans.append(key)
                 except:
@@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
                 if not a:
                     continue
                 url = 'http://www.tagesspiegel.de' + a['href']
+
+                # check for duplicates
+                if url in links:
+                    continue
+                links.add(url)
+
                 title = self.tag_to_string(a, use_alt=True).strip()
                 description = ''
                 pubdate = strftime('%a, %d %b')
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
+        for a in soup('a'):
+            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+                if 'tanuki-anime' in soup.title.string.lower():
+                    a['href']='http://anime.tanuki.pl' + a['href']
+                elif 'tanuki-manga' in soup.title.string.lower():
+                    a['href']='http://manga.tanuki.pl' + a['href']
+                elif 'tanuki-czytelnia' in soup.title.string.lower():
+                    a['href']='http://czytelnia.tanuki.pl' + a['href']
         return soup
\ No newline at end of file
diff --git a/recipes/telam.recipe b/recipes/telam.recipe
new file mode 100644
index 0000000000..c2dbfee1d7
--- /dev/null
+++ b/recipes/telam.recipe
@@ -0,0 +1,62 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic'

', re.IGNORECASE | re.DOTALL), lambda match: '')]
+
+
keep_only_tags = [
dict(name='h1'),dict(name='h2',attrs={'class' : 'medium centered'}),
dict(name='div',attrs={'class' : 'text-center'}),
dict(name='div',attrs={'id' : 'bodyText'})
# dict(name='p')
]
-
remove_tags=[
#dict(name='head'),
dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
@@ -46,12 +48,46 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
]
+
feeds = [
- (u'News','http://feed43.com/2517447382644748.xml'),
- (u'Sport', u'http://feed43.com/4283846255668687.xml'),
- (u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
- (u'Film',u'http://feed43.com/1307545221226200.xml'),
- (u'Music',u'http://feed43.com/1701513435064132.xml'),
- (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
+ (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
+ (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
+ (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
+ (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
]
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
+ # look for the block containing the sun button and url
+ cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
+
+ #cov = soup.find(attrs={'id' : 'large'})
+ cov2 = str(cov)
+
+ cov2='http://www.politicshome.com'+cov2[9:-133]
+ #cov2 now contains url of the page containing pic
+ soup = self.index_to_soup(cov2)
+ cov = soup.find(attrs={'id' : 'large'})
+ cov2 = str(cov)
+ cov2=cov2[27:-18]
+ #cov2 now is pic url, now go back to original function
+
+ br = browser()
+ br.set_handle_redirect(False)
+ try:
+ br.open_novisit(cov2)
+ cover_url = cov2
+ except:
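+            # assumption: the politicshome.com lookup failed; fall back to one
+            # of The Sun's stock error-page images so the issue still gets a cover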
+ cover_url = random.choice((
+ 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
+ ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
+ ))
+
+ return cover_url
+
+
+
diff --git a/recipes/tpm_uk.recipe b/recipes/tpm_uk.recipe
index aa042de951..0ccad32fa9 100644
--- a/recipes/tpm_uk.recipe
+++ b/recipes/tpm_uk.recipe
@@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
__author__ = 'Darko Miletic'
description = 'Title says it all'
publisher = "The Philosophers' Magazine"
+ recipe_disabled = ('This recipe has been disabled as the website has'
+ ' started providing articles only in PDF form')
category = 'philosophy, news'
oldest_article = 25
max_articles_per_feed = 200
diff --git a/recipes/trouw.recipe b/recipes/trouw.recipe
index c7ad3e8ad1..294595b547 100644
--- a/recipes/trouw.recipe
+++ b/recipes/trouw.recipe
@@ -1,71 +1,12 @@
-#!/usr/bin/python
-
from calibre.web.feeds.news import BasicNewsRecipe
-class Trouw(BasicNewsRecipe):
+class BasicUserRecipe1333905513(BasicNewsRecipe):
title = u'Trouw'
- __author__ = u'JvdW'
+ __author__ = 'asalet_r'
+ language = 'nl'
description = u'Trouw de Verdieping'
- oldest_article = 7
+ oldest_article = 1
max_articles_per_feed = 25
- language = u'nl'
- simultaneous_downloads = 1
- delay = 1
-# timefmt = ' [%A, %d %B, %Y]'
- timefmt = ''
- no_stylesheets = True
- cover_url = 'http://www.trouw.nl/template/ver2-0/images/trouw_logo.gif'
+ auto_cleanup = True
-# keep_only_tags = [ dict(name='div', attrs={'id':'content'}) ]
- remove_tags = [
- dict(name='div', attrs={'id' :'leaderboard' })
- ,dict(name='div', attrs={'class':'banner' })
- ,dict(name='div', attrs={'id' :'header' })
- ,dict(name='div', attrs={'class':'options' })
- ,dict(name='div', attrs={'id' :'menu_main' })
- ,dict(name='div', attrs={'id' :'menu_sub' })
- ,dict(name='div', attrs={'id' :'column_right' })
- ,dict(name='div', attrs={'class':'meta_information'})
- ,dict(name='div', attrs={'id' :'comments_form' })
- ,dict(name='div', attrs={'id' :'mailfriend' })
- ,dict(name='div', attrs={'id' :'footer' })
- ,dict(name='img', attrs={'id' :'dot_clear' })
- ]
-
- keep_only_tags = [dict(id=['columns'])]
-
- feeds = [
- (u'Algemen', u'http://www.trouw.nl/?service=rss'),
- (u'Nederland', u'http://www.trouw.nl/nieuws/nederland/?service=rss'),
- (u'Europa', u'http://www.trouw.nl/nieuws/europa/?service=rss'),
- (u'Wereld', u'http://www.trouw.nl/nieuws/wereld/?service=rss'),
- (u'Economie', u'http://www.trouw.nl/nieuws/economie/?service=rss'),
- (u'Wetenschap', u'http://www.trouw.nl/nieuws/Wetenschap/?service=rss'),
- (u'Groen', u'http://www.trouw.nl/groen/?service=rss'),
- (u'Religie en Filosofie', u'http://www.trouw.nl/religie-filosofie/?service=rss'),
- (u'Politiek', u'http://www.trouw.nl/nieuws/politiek/?service=rss'),
- (u'Zorg', u'http://www.trouw.nl/nieuws/zorg/?service=rss'),
- (u'Onderwijs', u'http://www.trouw.nl/onderwijs/nieuws/?service=rss'),
- (u'Sport', u'http://www.trouw.nl/nieuws/sport/?service=rss'),
- (u'Achtergrond', u'http://www.trouw.nl/achtergrond/?service=rss'),
- (u'De Verdieping', u'http://www.trouw.nl/achtergrond/deverdieping/?service=rss'),
- (u'Naschrift', u'http://www.trouw.nl/achtergrond/Naschrift/?service=rss'),
- (u'Opinie', u'http://www.trouw.nl/opinie/?service=rss'),
- (u'Podium', u'http://www.trouw.nl/opinie/podium/?service=rss'),
- (u'Commentaar', u'http://www.trouw.nl/opinie/commentaar/?service=rss'),
- (u'Cultuur', u'http://www.trouw.nl/cultuur/?service=rss'),
- (u'Boeken', u'http://www.trouw.nl/cultuur/boeken/?service=rss'),
- (u'Film', u'http://www.trouw.nl/cultuur/film/?service=rss'),
- (u'Beeldende kunst', u'http://www.trouw.nl/cultuur/beeldendekunst/?service=rss'),
- (u'Theater', u'http://www.trouw.nl/cultuur/theater/?service=rss'),
- (u'Muziek', u'http://www.trouw.nl/cultuur/muziek/?service=rss'),
- (u'Kinderen', u'http://www.trouw.nl/cultuur/kinderen/?service=rss'),
- (u'Ontspanning', u'http://www.trouw.nl/ontspanning/?service=rss'),
- (u'De Gids', u'http://www.trouw.nl/ontspanning/degids/?service=rss'),
- (u'Moderne manieren', u'http://www.trouw.nl/ontspanning/modernemanieren/?service=rss'),
- (u'Reizen', u'http://www.trouw.nl/ontspanning/reizen/?service=rss'),
- (u'Koken', u'http://www.trouw.nl/ontspanning/koken/?service=rss')
- ]
-
- def print_version(self, url):
- return url + '?all=true'
+ feeds = [(u'Nederland', u'http://www.trouw.nl/nieuws/nederland/rss.xml'), (u'Buitenland', u'http://www.trouw.nl/nieuws/buitenland/rss.xml'), (u'Politiek', u'http://www.trouw.nl/nieuws/politiek/rss.xml'), (u'Economie', u'http://www.trouw.nl/nieuws/economie/rss.xml'), (u'Sport', u'http://www.trouw.nl/nieuws/sport/rss.xml'), (u'Cultuur', u'http://www.trouw.nl/nieuws/cultuur/rss.xml'), (u'Gezondheid', u'http://www.trouw.nl/nieuws/gezondheid/rss.xml'), (u'Onderwijs', u'http://www.trouw.nl/nieuws/onderwijs/rss.xml'), (u'Opinie', u'http://www.trouw.nl/opinie/rss.xml'), (u'Groen', u'http://www.trouw.nl/groen/rss.xml'), (u'Religie-Filosofie', u'http://www.trouw.nl/religie-filosofie/rss.xml'), (u'Schrijf', u'http://www.trouw.nl/schrijf/rss.xml'), (u'Moderne Manieren', u'http://www.trouw.nl/moderne-manieren/rss.xml')]
diff --git a/recipes/tweakers_net.recipe b/recipes/tweakers_net.recipe
index f9bbe27ec9..e285d43e2e 100644
--- a/recipes/tweakers_net.recipe
+++ b/recipes/tweakers_net.recipe
@@ -2,65 +2,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
+''' Changelog
+ 2012-04-27 DrMerry:
+ Added cover picture
+ removed some extra tags
+'''
+
__license__ = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal'

', re.IGNORECASE | re.DOTALL), lambda match : ''),
-                  (re.compile(r' ', re.IGNORECASE | re.DOTALL), lambda match : ''),
-                  (re.compile(r' ', re.IGNORECASE | re.DOTALL), lambda match : ''),
-                  (re.compile(r'
- tags!
-        return str(soup)
+        try:
+            return str(soup)
+        except RuntimeError:
+            return data
 
     def Contents(self):
         if self._contents is not None:
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 559402ca1c..877b15c24a 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -156,9 +156,10 @@ def add_pipeline_options(parser, plumber):
               'SEARCH AND REPLACE' : (
                  _('Modify the document text and structure using user defined patterns.'),
                  [
-                  'sr1_search', 'sr1_replace',
-                  'sr2_search', 'sr2_replace',
-                  'sr3_search', 'sr3_replace',
+                  'sr1_search', 'sr1_replace',
+                  'sr2_search', 'sr2_replace',
+                  'sr3_search', 'sr3_replace',
+                  'search_replace',
                  ]
               ),
 
@@ -211,6 +212,7 @@ def add_pipeline_options(parser, plumber):
             if rec.level < rec.HIGH:
                 option_recommendation_to_cli_option(add_option, rec)
 
+
 def option_parser():
     parser = OptionParser(usage=USAGE)
     parser.add_option('--list-recipes', default=False, action='store_true',
@@ -271,6 +273,34 @@ def abspath(x):
         return x
     return os.path.abspath(os.path.expanduser(x))
 
+def read_sr_patterns(path, log=None):
+    import json, re, codecs
+    pats = []
+    with codecs.open(path, 'r', 'utf-8') as f:
+        pat = None
+        for line in f.readlines():
+            if line.endswith(u'\n'):
+                line = line[:-1]
+
+            if pat is None:
+                if not line.strip():
+                    continue
+                try:
+                    re.compile(line)
+                except:
+                    msg = u'Invalid regular expression: %r from file: %r'%(
+                            line, path)
+                    if log is not None:
+                        log.error(msg)
+                        raise SystemExit(1)
+                    else:
+                        raise ValueError(msg)
+                pat = line
+            else:
+                pats.append((pat, line))
+                pat = None
+    return json.dumps(pats)
+
 def main(args=sys.argv):
     log = Log()
     parser, plumber = create_option_parser(args, log)
@@ -278,6 +308,9 @@ def main(args=sys.argv):
     for x in ('read_metadata_from_opf', 'cover'):
         if getattr(opts, x, None) is not None:
             setattr(opts, x, abspath(getattr(opts, x)))
+    if opts.search_replace:
+        opts.search_replace = read_sr_patterns(opts.search_replace, log)
+
     recommendations = [(n.dest, getattr(opts, n.dest), OptionRecommendation.HIGH) \
                                      for n in parser.options_iter()
diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py
index 77da1bb290..bb60412e1b 100644
--- a/src/calibre/ebooks/conversion/plugins/epub_input.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_input.py
@@ -65,6 +65,7 @@ class EPUBInput(InputFormatPlugin):
             return False
 
     def rationalize_cover(self, opf, log):
+        removed = None
        from lxml import etree
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
@@ -91,6 +92,7 @@ class EPUBInput(InputFormatPlugin):
         # specially
         if not self.for_viewer:
             spine[0].getparent().remove(spine[0])
+            removed = guide_cover
         guide_elem.set('href', 'calibre_raster_cover.jpg')
         from calibre.ebooks.oeb.base import OPF
         t = etree.SubElement(elem[0].getparent(), OPF('item'),
@@ -109,6 +111,7 @@ class EPUBInput(InputFormatPlugin):
             if renderer is not None:
                 open('calibre_raster_cover.jpg', 'wb').write(
                         renderer)
+        return removed
 
     def find_opf(self):
         from lxml import etree
@@ -170,7 +173,7 @@ class EPUBInput(InputFormatPlugin):
                     for elem in opf.iterguide():
                         elem.set('href', delta+elem.get('href'))
 
-        self.rationalize_cover(opf, log)
+        self.removed_cover = self.rationalize_cover(opf, log)
         self.optimize_opf_parsing = opf
         for x in opf.itermanifest():
@@ -198,3 +201,17 @@ class EPUBInput(InputFormatPlugin):
             nopf.write(opf.render())
 
         return os.path.abspath(u'content.opf')
+
+    def postprocess_book(self, oeb, opts, log):
+        rc = getattr(self, 'removed_cover', None)
+        if rc:
+            cover_toc_item = None
+            for item in oeb.toc.iterdescendants():
+                if item.href == rc:
+                    cover_toc_item = item
+                    break
+            spine = {x.href for x in oeb.spine}
+            if (cover_toc_item is not None and cover_toc_item.href not in spine):
+                oeb.toc.item_that_refers_to_cover = cover_toc_item
+
+
diff --git a/src/calibre/ebooks/conversion/plugins/epub_output.py b/src/calibre/ebooks/conversion/plugins/epub_output.py
index 45df8ba9d1..0da2868969 100644
--- a/src/calibre/ebooks/conversion/plugins/epub_output.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_output.py
@@ -312,13 +312,9 @@ class EPUBOutput(OutputFormatPlugin):
         Perform various markup transforms to get the output to render
         correctly in the quirky ADE.
         '''
-        from calibre.ebooks.oeb.base import XPath, XHTML, OEB_STYLES, barename, urlunquote
+        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
 
-        stylesheet = None
-        for item in self.oeb.manifest:
-            if item.media_type.lower() in OEB_STYLES:
-                stylesheet = item
-                break
+        stylesheet = self.oeb.manifest.main_stylesheet
 
         # ADE cries big wet tears when it encounters an invalid fragment
         # identifier in the NCX toc.
diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py
index b0d6a8b0ae..e1e619600d 100644
--- a/src/calibre/ebooks/conversion/plugins/fb2_input.py
+++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py
@@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin