Merge from trunk

Charles Haley 2013-04-22 16:42:03 +02:00
commit bedcfdbbc9
235 changed files with 55529 additions and 40162 deletions

View File

@ -40,6 +40,7 @@ recipes/.gitignore
recipes/README.md
recipes/icon_checker.py
recipes/readme_updater.py
recipes/garfield.recipe
recipes/katalog_egazeciarz.recipe
recipes/tv_axnscifi.recipe
recipes/tv_comedycentral.recipe
@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
recipes/tv_tvpuls.recipe
recipes/tv_viasathistory.recipe
recipes/icons/katalog_egazeciarz.png
recipes/icons/garfield.png
recipes/icons/tv_axnscifi.png
recipes/icons/tv_comedycentral.png
recipes/icons/tv_discoveryscience.png

View File

@ -79,13 +79,6 @@ License: GPL2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-2 on Debian systems.
Files: src/pyPdf/*
Copyright: Copyright (c) 2006, Mathieu Fenniak
Copyright: Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
License: BSD
The full text of the BSD license is distributed as in
/usr/share/common-licenses/BSD on Debian systems.
Files: src/calibre/utils/lzx/*
Copyright: Copyright (C) 2002, Matthew T. Russotto
Copyright: Copyright (C) 2008, Marshall T. Vandegrift <llasram@gmail.com>
@ -100,49 +93,6 @@ License: BSD
The full text of the BSD license is distributed as in
/usr/share/common-licenses/BSD on Debian systems.
Files: src/calibre/utils/pyparsing.py
Copyright: Copyright (c) 2003-2008, Paul T. McGuire
License: MIT
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Files: src/calibre/utils/PythonMagickWand.py
Copyright: (c) 2007 - Achim Domma - domma@procoders.net
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Files: src/calibre/utils/msdes/d3des.h: Files: src/calibre/utils/msdes/d3des.h:
Files: src/calibre/utils/msdes/des.c: Files: src/calibre/utils/msdes/des.c:
Copyright: Copyright (C) 1988,1989,1990,1991,1992, Richard Outerbridge Copyright: Copyright (C) 1988,1989,1990,1991,1992, Richard Outerbridge

View File

@ -1,4 +1,4 @@
# vim:fileencoding=UTF-8:ts=2:sw=2:sta:et:sts=2:ai
# vim:fileencoding=utf-8:ts=2:sw=2:sta:et:sts=2:ai
# Each release can have new features and bug fixes. Each of which
# must have a title and can optionally have linked tickets and a description.
# In addition they can have a type field which defaults to minor, but should be major
@ -20,6 +20,118 @@
# new recipes:
# - title:
- version: 0.9.27
date: 2013-04-12
new features:
- title: "Metadata download: Add two new sources for covers: Google Image Search and bigbooksearch.com."
description: "To enable them go to Preferences->Metadata download and enable the 'Google Image' and 'Big Book Search' sources. Google Images is useful for finding larger covers as well as alternate versions of the cover. Big Book Search searches for alternate covers from amazon.com. It can occasionally find nicer covers than the direct Amazon source. Note that both these sources download multiple covers for a single book. Some of these covers can be wrong (i.e. they may be of a different book or not covers at all, so you should inspect the results and manually pick the best match). When bulk downloading, these sources are only used if the other sources find no covers."
type: major
- title: "Content server: Allow specifying a restriction to use for the server when embedding it as a WSGI app."
tickets: [1167951]
- title: "Get Books: Add a plugin for the Koobe Polish book store"
- title: "calibredb add_format: Add an option to not replace existing formats. Also pep8 compliance."
- title: "Allow restoring of the ORIGINAL_XXX format by right-clicking it in the book details panel"
bug fixes:
- title: "AZW3 Input: Do not fail to identify JPEG images with 8BIM headers created with Adobe Photoshop."
tickets: [1167985]
- title: "Amazon metadata download: Ignore Spanish edition entries when searching for a book on amazon.com"
- title: "TXT Input: When converting a txt file with a Byte Order Mark, remove the Byte Order Mark before further processing as it can cause the first line of the text to be mis-interpreted."
- title: "Get Books: Fix searching for current book/title/author by right clicking the get books icon"
- title: "Get Books: Update nexto, gutenberg, and virtualo store plugins for website changes"
- title: "Amazon metadata download: When downloading from amazon.co.jp handle the 'Black curtain redirect' for adult titles."
tickets: [1165628]
- title: "When extracting zip files do not allow maliciously created zip files to overwrite other files on the system"
- title: "RTF Input: Handle RTF files with invalid border style specifications"
tickets: [1021270]
improved recipes:
- The Escapist
- San Francisco Chronicle
- The Onion
- Fronda
- Tom's Hardware
- New Yorker
- Financial Times UK
- Business Week Magazine
- Victoria Times
- tvxs
- The Independent
new recipes:
- title: Economia
author: Manish Bhattarai
- title: Universe Today
author: seird
- title: The Galaxy's Edge
author: Krittika Goyal
- version: 0.9.26
date: 2013-04-05
new features:
- title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."
- title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
tickets: [1163520]
- title: "ToC Editor: Add buttons to indent/unindent the current entry"
- title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"
- title: "Column icons: Allow use of wide images as column icons"
- title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
tickets: [1162293,1163115]
bug fixes:
- title: "PDF Output: Fix generating page numbers causing links to not work."
tickets: [1162573]
- title: "Wrong filename output in error message when 'Guide reference not found'"
tickets: [1163659]
- title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"
- title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
tickets: [1162054]
- title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
tickets: [1161999]
improved recipes:
- Financial Times UK
- Sing Tao Daily
- Apple Daily
- A List Apart
- Business Week
- Harpers printed edition
- Harvard Business Review
new recipes:
- title: AM730
author: Eddie Lau
- title: Arret sur images
author: Francois D
- title: Diario de Noticias
author: Jose Pinto
- version: 0.9.25
date: 2013-03-29

View File

@ -436,8 +436,8 @@ generate a Table of Contents in the converted ebook, based on the actual content
.. note:: Using these options can be a little challenging to get exactly right.
If you prefer creating/editing the Table of Contents by hand, convert to
the EPUB or AZW3 formats and select the checkbox at the bottom of the
screen that says
the EPUB or AZW3 formats and select the checkbox at the bottom of the Table
of Contents section of the conversion dialog that says
:guilabel:`Manually fine-tune the Table of Contents after conversion`.
This will launch the ToC Editor tool after the conversion. It allows you to
create entries in the Table of Contents by simply clicking the place in the

View File

@ -647,12 +647,17 @@ computers. Run |app| on a single computer and access it via the Content Server
or a Remote Desktop solution.
If you must share the actual library, use a file syncing tool like
DropBox or rsync or Microsoft SkyDrive instead of a networked drive. Even with
these tools there is danger of data corruption/loss, so only do this if you are
willing to live with that risk. In particular, be aware that **Google Drive**
is incompatible with |app|, if you put your |app| library in Google Drive, you
*will* suffer data loss. See
`this thread <http://www.mobileread.com/forums/showthread.php?t=205581>`_ for details.
DropBox or rsync or Microsoft SkyDrive instead of a networked drive. If you are
using a file-syncing tool it is **essential** that you make sure that both
|app| and the file syncing tool do not try to access the |app| library at the
same time. In other words, **do not** run the file syncing tool and |app| at
the same time.
Even with these tools there is danger of data corruption/loss, so only do this
if you are willing to live with that risk. In particular, be aware that
**Google Drive** is incompatible with |app|, if you put your |app| library in
Google Drive, **you will suffer data loss**. See `this thread
<http://www.mobileread.com/forums/showthread.php?t=205581>`_ for details.
Content From The Web
---------------------
@ -797,6 +802,12 @@ Downloading from the Internet can sometimes result in a corrupted download. If t
* Try temporarily disabling your antivirus program (Microsoft Security Essentials, or Kaspersky or Norton or McAfee or whatever). This is most likely the culprit if the upgrade process is hanging in the middle.
* Try rebooting your computer and running a registry cleaner like `Wise registry cleaner <http://www.wisecleaner.com>`_.
* Try downloading the installer with an alternate browser. For example if you are using Internet Explorer, try using Firefox or Chrome instead.
* If you get an error about a missing DLL on windows, then most likely, the
permissions on your temporary folder are incorrect. Go to the folder
:file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows explorer and then
right click on the :file:`Temp` folder and select :guilabel:`Properties` and go to
the :guilabel:`Security` tab. Make sure that your user account has full control
for this folder.
If you still cannot get the installer to work and you are on windows, you can use the `calibre portable install <http://calibre-ebook.com/download_portable>`_, which does not need an installer (it is just a zip file).

View File

@ -368,6 +368,8 @@ For example::
date:>10daysago
date:<=45daysago
To avoid potential problems with translated strings when using a non-English version of calibre, the strings ``_today``, ``_yesterday``, ``_thismonth``, and ``_daysago`` are always available. They are not translated.
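For example, assuming the underscore-prefixed form substitutes directly for the localized keyword (an illustrative usage, not part of the change above), a locale-independent version of the first search above would be::

date:>10_daysago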
You can search for books that have a format of a certain size like this::
size:>1.1M Will find books with a format larger than 1.1MB

View File

@ -91,7 +91,11 @@ First, we have to create a WSGI *adapter* for the calibre content server. Here i
# Path to the calibre library to be served
# The server process must have write permission for all files/dirs
# in this directory or BAD things will happen
path_to_library='/home/kovid/documents/demo library'
path_to_library='/home/kovid/documents/demo library',
# The virtual library (restriction) to be used when serving this
# library.
virtual_library=None
)
del create_wsgi_app
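For reference, a complete adapter built around these options might look like the sketch below. It is illustrative only: the import path and the sys.path setup are assumptions based on calibre 0.9.x-era WSGI deployments, and the library path is a placeholder; check the output of calibre-debug --paths on the machine that will run the server.

# Sketch of a WSGI adapter for the calibre content server (assumptions noted above).
import sys

# Make the calibre code importable; take the real value from `calibre-debug --paths`.
sys.path.insert(0, '/path/to/calibre/src')  # hypothetical path

from calibre.library.server.main import create_wsgi_app  # assumed import location

application = create_wsgi_app(
    # Path to the calibre library to be served. The server process must have
    # write permission for all files/dirs in this directory.
    path_to_library='/home/kovid/documents/demo library',
    # The virtual library (restriction) to be used when serving this library;
    # None serves the library unrestricted.
    virtual_library=None,
)

# The WSGI container only needs the `application` object.
del create_wsgi_app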

View File

@ -9,14 +9,14 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
__author__ = 'Dave Asbury'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
oldest_article = 2
max_articles_per_feed = 12
max_articles_per_feed = 20
linearize_tables = True
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
auto_cleanup = True
language = 'en_GB'
compress_news_images = True
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'

View File

@ -1,3 +1,4 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict
@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
title=self.tag_to_string(div.a).strip()
url=div.a['href']
soup0 = self.index_to_soup(url)
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})
@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
title=self.tag_to_string(div.a).strip()
url=div.a['href']
soup0 = self.index_to_soup(url)
urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})
if articles:

View File

@ -7,13 +7,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
# last updated 8/12/12
# last updated 19/10/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
remove_empty_feeds = True
no_stylesheets = True
auto_cleanup = True
compress_news_images = True
ignore_duplicate_articles = {'title', 'url'}
#articles_are_obfuscated = True
#article_already_exists = False

View File

@ -13,9 +13,9 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
compress_news_images = True
oldest_article = 1
max_articles_per_feed = 1
max_articles_per_feed = 12
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True

View File

@ -0,0 +1,23 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1365070687(BasicNewsRecipe):
title ='Diário de Notícias'
oldest_article = 7
language = 'pt'
__author__ = 'Jose Pinto'
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
(u'Globo', u'http://feeds.dn.pt/DN-Globo'),
(u'Economia', u'http://feeds.dn.pt/DN-Economia'),
(u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
(u'Artes', u'http://feeds.dn.pt/DN-Artes'),
(u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
(u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
(u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
]

View File

@ -16,7 +16,7 @@ class EcoGeek(BasicNewsRecipe):
language = 'en'
category = 'news, ecology, blog'
oldest_article = 7
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
@ -28,5 +28,5 @@ class EcoGeek(BasicNewsRecipe):
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds2.feedburner.com/EcoGeek')]

recipes/economia.recipe Normal file
View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1314326622(BasicNewsRecipe):
title = u'Economia'
__author__ = 'Manish Bhattarai'
description = 'Economia - Intelligence & Insight for ICAEW Members'
language = 'en_GB'
oldest_article = 7
max_articles_per_feed = 25
masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
no_stylesheets = True
remove_empty_feeds = True
remove_tags_before = dict(id='content')
remove_tags_after = dict(id='stars-wrapper')
remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]

View File

@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
language = 'pl'
encoding = 'utf-8'
INDEX = 'http://www.esensja.pl'
extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
.t-author {font-size: x-small; text-align: left}
.t-title2 {font-size: x-small; font-style: italic; text-align: left}
.text {font-size: small; text-align: left}
.annot-ref {font-style: italic; text-align: left}
'''
cover_url = ''
masthead_url = 'http://esensja.pl/img/wrss.gif'
use_embedded_content = False

View File

@ -1,13 +1,14 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/uk-edition
www.ft.com/intl/uk-edition
'''
import datetime
from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
class FinancialTimes(BasicNewsRecipe):
title = 'Financial Times (UK)'
@ -28,7 +29,7 @@ class FinancialTimes(BasicNewsRecipe):
masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
LOGIN = 'https://registration.ft.com/registration/barrier/login'
LOGIN2 = 'http://media.ft.com/h/subs3.html'
INDEX = 'http://www.ft.com/uk-edition'
INDEX = 'http://www.ft.com/intl/uk-edition'
PREFIX = 'http://www.ft.com'
conversion_options = {
@ -105,29 +106,30 @@ class FinancialTimes(BasicNewsRecipe):
return articles return articles
def parse_index(self): def parse_index(self):
feeds = [] feeds = OrderedDict()
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) #dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
self.timefmt = ' [%s]'%dates #self.timefmt = ' [%s]'%dates
wide = soup.find('div',attrs={'class':'wide'}) section_title = 'Untitled'
if not wide:
return feeds for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()}) for section in column. findAll('div', attrs = {'class':'feedBox'}):
if not allsections: sectiontitle=self.tag_to_string(section.find('h4'))
return feeds if '...' not in sectiontitle: section_title=sectiontitle
count = 0 for article in section.ul.findAll('li'):
for item in allsections: articles = []
count = count + 1 title=self.tag_to_string(article.a)
if self.test and count > 2: url=article.a['href']
return feeds articles.append({'title':title, 'url':url, 'description':'', 'date':''})
fitem = item.h3
if not fitem: if articles:
fitem = item.h4 if section_title not in feeds:
ftitle = self.tag_to_string(fitem) feeds[section_title] = []
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) feeds[section_title] += articles
feedarts = self.get_artlinks(item.ul)
feeds.append((ftitle,feedarts))
return feeds ans = [(key, val) for key, val in feeds.iteritems()]
return ans
def preprocess_html(self, soup): def preprocess_html(self, soup):
items = ['promo-box','promo-title', items = ['promo-box','promo-title',
@ -177,6 +179,3 @@ class FinancialTimes(BasicNewsRecipe):
tfile.close()
self.temp_files.append(tfile)
return tfile.name
def cleanup(self):
self.browser.open('https://registration.ft.com/registration/login/logout?location=')

View File

@ -1,20 +1,21 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
http://www.ft.com/intl/us-edition
www.ft.com/intl/international-edition
'''
import datetime
from calibre.ptempfile import PersistentTemporaryFile
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
class FinancialTimes(BasicNewsRecipe):
title = 'Financial Times (US) printed edition'
title = 'Financial Times (International) printed edition'
__author__ = 'Darko Miletic'
description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
publisher = 'The Financial Times Ltd.'
category = 'news, finances, politics, UK, World'
category = 'news, finances, politics, World'
oldest_article = 2
language = 'en'
max_articles_per_feed = 250
@ -28,7 +29,7 @@ class FinancialTimes(BasicNewsRecipe):
masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'
LOGIN = 'https://registration.ft.com/registration/barrier/login'
LOGIN2 = 'http://media.ft.com/h/subs3.html'
INDEX = 'http://www.ft.com/intl/us-edition'
INDEX = 'http://www.ft.com/intl/international-edition'
PREFIX = 'http://www.ft.com'
conversion_options = {
@ -105,29 +106,30 @@ class FinancialTimes(BasicNewsRecipe):
return articles return articles
def parse_index(self): def parse_index(self):
feeds = [] feeds = OrderedDict()
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) #dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
self.timefmt = ' [%s]'%dates #self.timefmt = ' [%s]'%dates
wide = soup.find('div',attrs={'class':'wide'}) section_title = 'Untitled'
if not wide:
return feeds for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()}) for section in column. findAll('div', attrs = {'class':'feedBox'}):
if not allsections: sectiontitle=self.tag_to_string(section.find('h4'))
return feeds if '...' not in sectiontitle: section_title=sectiontitle
count = 0 for article in section.ul.findAll('li'):
for item in allsections: articles = []
count = count + 1 title=self.tag_to_string(article.a)
if self.test and count > 2: url=article.a['href']
return feeds articles.append({'title':title, 'url':url, 'description':'', 'date':''})
fitem = item.h3
if not fitem: if articles:
fitem = item.h4 if section_title not in feeds:
ftitle = self.tag_to_string(fitem) feeds[section_title] = []
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) feeds[section_title] += articles
feedarts = self.get_artlinks(item.ul)
feeds.append((ftitle,feedarts))
return feeds ans = [(key, val) for key, val in feeds.iteritems()]
return ans
def preprocess_html(self, soup): def preprocess_html(self, soup):
items = ['promo-box','promo-title', items = ['promo-box','promo-title',
@ -177,6 +179,3 @@ class FinancialTimes(BasicNewsRecipe):
tfile.close()
self.temp_files.append(tfile)
return tfile.name
def cleanup(self):
self.browser.open('https://registration.ft.com/registration/login/logout?location=')

recipes/forbes_pl.recipe Normal file
View File

@ -0,0 +1,53 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import re
class forbes_pl(BasicNewsRecipe):
title = u'Forbes.pl'
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
language = 'pl'
description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
oldest_article = 1
index = 'http://www.forbes.pl'
cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
max_articles_per_feed = 100
extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
remove_javascript = True
no_stylesheets = True
now = datetime.datetime.now()
yesterday = now - datetime.timedelta(hours=24)
yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
pages_count = 4
keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
'''def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
def append_page(self, soup, appendtag):
cleanup = False
nexturl = appendtag.find('a', attrs={'class':'next'})
if nexturl:
cleanup = True
while nexturl:
soup2 = self.index_to_soup(self.index + nexturl['href'])
nexturl = soup2.find('a', attrs={'class':'next'})
pagetext = soup2.findAll(id='article-body-wrapper')
if not pagetext:
pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
comment.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if cleanup:
for r in appendtag.findAll(attrs={'class':'paginator'}):
r.extract()'''

View File

@ -6,6 +6,7 @@ __copyright__ = u'2010-2013, Tomasz Dlugosz <tomek3d@gmail.com>'
fronda.pl
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import timedelta, date
@ -23,6 +24,7 @@ class Fronda(BasicNewsRecipe):
extra_css = '''
h1 {font-size:150%}
.body {text-align:left;}
div#featured-image {font-style:italic; font-size:70%}
'''
earliest_date = date.today() - timedelta(days=oldest_article)
@ -55,7 +57,10 @@ class Fronda(BasicNewsRecipe):
articles = {}
for url, genName in genres:
try:
soup = self.index_to_soup('http://www.fronda.pl/c/'+ url)
except:
continue
articles[genName] = []
for item in soup.findAll('li'):
article_h = item.find('h2')
@ -77,16 +82,15 @@ class Fronda(BasicNewsRecipe):
]
remove_tags = [
dict(name='div', attrs={'class':['related-articles',
'button right',
'pagination']}),
dict(name='div', attrs={'class':['related-articles','button right','pagination','related-articles content']}),
dict(name='h3', attrs={'class':'block-header article comments'}),
dict(name='ul', attrs={'class':'comment-list'}),
dict(name='ul', attrs={'class':'category'}),
dict(name='ul', attrs={'class':'tag-list'}),
dict(name='ul', attrs={'class':['comment-list','category','tag-list']}),
dict(name='p', attrs={'id':'comments-disclaimer'}),
dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}),
dict(name='div', attrs={'style':'text-align: left; margin-top: 15px; margin-bottom: 30px;'}),
dict(name='div', attrs={'class':'related-articles content'}),
dict(name='div', attrs={'id':'comment-form'})
dict(name='div', attrs={'id':'comment-form'}),
dict(name='span', attrs={'class':'separator'})
]
preprocess_regexps = [
(re.compile(r'komentarzy: .*?</h6>', re.IGNORECASE | re.DOTALL | re.M ), lambda match: '</h6>')]

recipes/galaxys_edge.recipe Normal file
View File

@ -0,0 +1,108 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
from calibre.web.feeds.news import BasicNewsRecipe
class GalaxyEdge(BasicNewsRecipe):
title = u'The Galaxy\'s Edge'
language = 'en'
oldest_article = 7
__author__ = 'Krittika Goyal'
no_stylesheets = True
auto_cleanup = True
#keep_only_tags = [dict(id='content')]
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
#'slidebox', 'th_footer'])]
extra_css = '.photo-caption { font-size: smaller }'
def parse_index(self):
soup = self.index_to_soup('http://www.galaxysedge.com/')
main = soup.find('table', attrs={'width':'911'})
toc = main.find('td', attrs={'width':'225'})
current_section = None
current_articles = []
feeds = []
c = 0
for x in toc.findAll(['p']):
c = c+1
if c == 5:
if current_articles and current_section:
feeds.append((current_section, current_articles))
edwo = x.find('a')
current_section = self.tag_to_string(edwo)
current_articles = []
self.log('\tFound section:', current_section)
title = self.tag_to_string(edwo)
url = edwo.get('href', True)
url = 'http://www.galaxysedge.com/'+url
print(title)
print(c)
if not url or not title:
continue
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
elif c>5:
current_section = self.tag_to_string(x.find('b'))
current_articles = []
self.log('\tFound section:', current_section)
for y in x.findAll('a'):
title = self.tag_to_string(y)
url = y.get('href', True)
url = 'http://www.galaxysedge.com/'+url
print(title)
if not url or not title:
continue
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
if current_articles and current_section:
feeds.append((current_section, current_articles))
return feeds
#def preprocess_raw_html(self, raw, url):
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
#def postprocess_html(self, soup, first_fetch):
#for t in soup.findAll(['table', 'tr', 'td','center']):
#t.name = 'div'
#return soup
#def parse_index(self):
#today = time.strftime('%Y-%m-%d')
#soup = self.index_to_soup(
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
#div = soup.find(id='left-column')
#feeds = []
#current_section = None
#current_articles = []
#for x in div.findAll(['h3', 'div']):
#if current_section and x.get('class', '') == 'tpaper':
#a = x.find('a', href=True)
#if a is not None:
#current_articles.append({'url':a['href']+'?css=print',
#'title':self.tag_to_string(a), 'date': '',
#'description':''})
#if x.name == 'h3':
#if current_section and current_articles:
#feeds.append((current_section, current_articles))
#current_section = self.tag_to_string(x)
#current_articles = []
#return feeds

View File

@ -10,7 +10,7 @@ krakow.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe
class gw_krakow(BasicNewsRecipe):
title = u'Gazeta.pl Kraków'
title = u'Gazeta Wyborcza Kraków'
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
language = 'pl'
description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'

View File

@ -5,7 +5,7 @@ import string
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
title = u'Gazeta Wyborcza Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'

View File

@ -10,7 +10,7 @@ warszawa.gazeta.pl
from calibre.web.feeds.news import BasicNewsRecipe
class gw_wawa(BasicNewsRecipe):
title = u'Gazeta.pl Warszawa'
title = u'Gazeta Wyborcza Warszawa'
__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
language = 'pl'
description ='Wiadomości z Warszawy na portalu Gazeta.pl.'

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta.pl'
title = u'Gazeta Wyborcza'
__author__ = 'fenuks, Artur Stachecki'
language = 'pl'
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'

View File

@ -1,90 +0,0 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
headers = soup.findAll('h2')
if len(headers) == .0:
break
for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds

View File

@ -1,448 +1,229 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.gocomics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import mechanize, re
class GoComics(BasicNewsRecipe):
title = 'GoComics' class Comics(BasicNewsRecipe):
title = 'Comics.com'
__author__ = 'Starson17' __author__ = 'Starson17'
__version__ = '1.06' description = 'Comics from comics.com. You should customize this recipe to fetch only the comics you are interested in'
__date__ = '07 June 2011'
description = u'200+ Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en' language = 'en'
use_embedded_content= False use_embedded_content= False
no_stylesheets = True no_stylesheets = True
oldest_article = 24
remove_javascript = True remove_javascript = True
cover_url = 'http://paulbuckley14059.files.wordpress.com/2008/06/calvin-and-hobbes.jpg' cover_url = 'http://www.bsb.lib.tx.us/images/comics.com.gif'
remove_attributes = ['style'] recursions = 0
max_articles_per_feed = 10
####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ########
# num_comics_to_get - I've tried up to 99 on Calvin&Hobbes
num_comics_to_get = 7 num_comics_to_get = 7
# comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large simultaneous_downloads = 1
comic_size = 900 # delay = 3
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
# Please do not overload their servers by selecting all comics and 1000 strips from each!
conversion_options = {'linearize_tables' : True keep_only_tags = [dict(name='h1'),
, 'comment' : description dict(name='p', attrs={'class':'feature_item'})
, 'tags' : category
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}),
] ]
remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}),
dict(name='div', attrs={'class':['tag-wrapper']}),
dict(name='a', attrs={'href':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
dict(name='img', attrs={'src':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
dict(name='ul', attrs={'class':['share-nav','feature-nav']}),
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
cookies = mechanize.CookieJar()
br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
br.addheaders = [('Referer','http://www.gocomics.com/')]
return br
def parse_index(self): def parse_index(self):
feeds = [] feeds = []
for title, url in [ for title, url in [
(u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"), ("9 Chickweed Lane", "http://gocomics.com/9_chickweed_lane"),
#(u"9 Chickweed Lane", u"http://www.gocomics.com/9chickweedlane"), ("Agnes", "http://gocomics.com/agnes"),
(u"9 to 5", u"http://www.gocomics.com/9to5"), ("Alley Oop", "http://gocomics.com/alley_oop"),
#(u"Adam At Home", u"http://www.gocomics.com/adamathome"), ("Andy Capp", "http://gocomics.com/andy_capp"),
(u"Agnes", u"http://www.gocomics.com/agnes"), ("Arlo & Janis", "http://gocomics.com/arlo&janis"),
#(u"Alley Oop", u"http://www.gocomics.com/alleyoop"), ("B.C.", "http://gocomics.com/bc"),
#(u"Andy Capp", u"http://www.gocomics.com/andycapp"), ("Ballard Street", "http://gocomics.com/ballard_street"),
#(u"Animal Crackers", u"http://www.gocomics.com/animalcrackers"), # ("Ben", "http://comics.com/ben"),
#(u"Annie", u"http://www.gocomics.com/annie"), # ("Betty", "http://comics.com/betty"),
#(u"Arlo & Janis", u"http://www.gocomics.com/arloandjanis"), # ("Big Nate", "http://comics.com/big_nate"),
#(u"Ask Shagg", u"http://www.gocomics.com/askshagg"), # ("Brevity", "http://comics.com/brevity"),
(u"B.C.", u"http://www.gocomics.com/bc"), # ("Candorville", "http://comics.com/candorville"),
#(u"Back in the Day", u"http://www.gocomics.com/backintheday"), # ("Cheap Thrills", "http://comics.com/cheap_thrills"),
#(u"Bad Reporter", u"http://www.gocomics.com/badreporter"), # ("Committed", "http://comics.com/committed"),
#(u"Baldo", u"http://www.gocomics.com/baldo"), # ("Cow & Boy", "http://comics.com/cow&boy"),
#(u"Ballard Street", u"http://www.gocomics.com/ballardstreet"), # ("Daddy's Home", "http://comics.com/daddys_home"),
#(u"Barkeater Lake", u"http://www.gocomics.com/barkeaterlake"), # ("Dog eat Doug", "http://comics.com/dog_eat_doug"),
#(u"Basic Instructions", u"http://www.gocomics.com/basicinstructions"), # ("Drabble", "http://comics.com/drabble"),
#(u"Ben", u"http://www.gocomics.com/ben"), # ("F Minus", "http://comics.com/f_minus"),
#(u"Betty", u"http://www.gocomics.com/betty"), # ("Family Tree", "http://comics.com/family_tree"),
#(u"Bewley", u"http://www.gocomics.com/bewley"), # ("Farcus", "http://comics.com/farcus"),
#(u"Big Nate", u"http://www.gocomics.com/bignate"), # ("Fat Cats Classics", "http://comics.com/fat_cats_classics"),
#(u"Big Top", u"http://www.gocomics.com/bigtop"), # ("Ferd'nand", "http://comics.com/ferdnand"),
#(u"Biographic", u"http://www.gocomics.com/biographic"), # ("Flight Deck", "http://comics.com/flight_deck"),
#(u"Birdbrains", u"http://www.gocomics.com/birdbrains"), # ("Flo & Friends", "http://comics.com/flo&friends"),
#(u"Bleeker: The Rechargeable Dog", u"http://www.gocomics.com/bleeker"), # ("Fort Knox", "http://comics.com/fort_knox"),
#(u"Bliss", u"http://www.gocomics.com/bliss"), # ("Frank & Ernest", "http://comics.com/frank&ernest"),
(u"Bloom County", u"http://www.gocomics.com/bloomcounty"), # ("Frazz", "http://comics.com/frazz"),
#(u"Bo Nanas", u"http://www.gocomics.com/bonanas"), # ("Free Range", "http://comics.com/free_range"),
#(u"Bob the Squirrel", u"http://www.gocomics.com/bobthesquirrel"), # ("Geech Classics", "http://comics.com/geech_classics"),
#(u"Boomerangs", u"http://www.gocomics.com/boomerangs"), # ("Get Fuzzy", "http://comics.com/get_fuzzy"),
#(u"Bottomliners", u"http://www.gocomics.com/bottomliners"), # ("Girls & Sports", "http://comics.com/girls&sports"),
#(u"Bound and Gagged", u"http://www.gocomics.com/boundandgagged"), # ("Graffiti", "http://comics.com/graffiti"),
#(u"Brainwaves", u"http://www.gocomics.com/brainwaves"), # ("Grand Avenue", "http://comics.com/grand_avenue"),
#(u"Brenda Starr", u"http://www.gocomics.com/brendastarr"), # ("Heathcliff", "http://comics.com/heathcliff"),
#(u"Brevity", u"http://www.gocomics.com/brevity"), # "Heathcliff, a street-smart and mischievous cat with many adventures."
#(u"Brewster Rockit", u"http://www.gocomics.com/brewsterrockit"), # ("Herb and Jamaal", "http://comics.com/herb_and_jamaal"),
#(u"Broom Hilda", u"http://www.gocomics.com/broomhilda"), # ("Herman", "http://comics.com/herman"),
(u"Calvin and Hobbes", u"http://www.gocomics.com/calvinandhobbes"), # ("Home and Away", "http://comics.com/home_and_away"),
#(u"Candorville", u"http://www.gocomics.com/candorville"), # ("It's All About You", "http://comics.com/its_all_about_you"),
#(u"Cathy", u"http://www.gocomics.com/cathy"), # ("Jane's World", "http://comics.com/janes_world"),
#(u"C'est la Vie", u"http://www.gocomics.com/cestlavie"), # ("Jump Start", "http://comics.com/jump_start"),
#(u"Cheap Thrills", u"http://www.gocomics.com/cheapthrills"), # ("Kit 'N' Carlyle", "http://comics.com/kit_n_carlyle"),
#(u"Chuckle Bros", u"http://www.gocomics.com/chucklebros"), # ("Li'l Abner Classics", "http://comics.com/lil_abner_classics"),
#(u"Citizen Dog", u"http://www.gocomics.com/citizendog"), # ("Liberty Meadows", "http://comics.com/liberty_meadows"),
#(u"Cleats", u"http://www.gocomics.com/cleats"), # ("Little Dog Lost", "http://comics.com/little_dog_lost"),
#(u"Close to Home", u"http://www.gocomics.com/closetohome"), # ("Lola", "http://comics.com/lola"),
#(u"Committed", u"http://www.gocomics.com/committed"), # ("Luann", "http://comics.com/luann"),
#(u"Compu-toon", u"http://www.gocomics.com/compu-toon"), # ("Marmaduke", "http://comics.com/marmaduke"),
#(u"Cornered", u"http://www.gocomics.com/cornered"), # ("Meg! Classics", "http://comics.com/meg_classics"),
#(u"Cow & Boy", u"http://www.gocomics.com/cow&boy"), # ("Minimum Security", "http://comics.com/minimum_security"),
#(u"Cul de Sac", u"http://www.gocomics.com/culdesac"), # ("Moderately Confused", "http://comics.com/moderately_confused"),
#(u"Daddy's Home", u"http://www.gocomics.com/daddyshome"), # ("Momma", "http://comics.com/momma"),
#(u"Deep Cover", u"http://www.gocomics.com/deepcover"), # ("Monty", "http://comics.com/monty"),
#(u"Dick Tracy", u"http://www.gocomics.com/dicktracy"), # ("Motley Classics", "http://comics.com/motley_classics"),
(u"Dog Eat Doug", u"http://www.gocomics.com/dogeatdoug"), # ("Nancy", "http://comics.com/nancy"),
#(u"Domestic Abuse", u"http://www.gocomics.com/domesticabuse"), # ("Natural Selection", "http://comics.com/natural_selection"),
(u"Doodles", u"http://www.gocomics.com/doodles"), # ("Nest Heads", "http://comics.com/nest_heads"),
(u"Doonesbury", u"http://www.gocomics.com/doonesbury"), # ("Off The Mark", "http://comics.com/off_the_mark"),
#(u"Drabble", u"http://www.gocomics.com/drabble"), # ("On a Claire Day", "http://comics.com/on_a_claire_day"),
#(u"Eek!", u"http://www.gocomics.com/eek"), # ("One Big Happy Classics", "http://comics.com/one_big_happy_classics"),
#(u"F Minus", u"http://www.gocomics.com/fminus"), # ("Over the Hedge", "http://comics.com/over_the_hedge"),
#(u"Family Tree", u"http://www.gocomics.com/familytree"), # ("PC and Pixel", "http://comics.com/pc_and_pixel"),
#(u"Farcus", u"http://www.gocomics.com/farcus"), # ("Peanuts", "http://comics.com/peanuts"),
(u"Fat Cats Classics", u"http://www.gocomics.com/fatcatsclassics"), # ("Pearls Before Swine", "http://comics.com/pearls_before_swine"),
#(u"Ferd'nand", u"http://www.gocomics.com/ferdnand"), # ("Pickles", "http://comics.com/pickles"),
#(u"Flight Deck", u"http://www.gocomics.com/flightdeck"), # ("Prickly City", "http://comics.com/prickly_city"),
(u"Flo and Friends", u"http://www.gocomics.com/floandfriends"), # ("Raising Duncan Classics", "http://comics.com/raising_duncan_classics"),
#(u"For Better or For Worse", u"http://www.gocomics.com/forbetterorforworse"), # ("Reality Check", "http://comics.com/reality_check"),
#(u"For Heaven's Sake", u"http://www.gocomics.com/forheavenssake"), # ("Red & Rover", "http://comics.com/red&rover"),
#(u"Fort Knox", u"http://www.gocomics.com/fortknox"), # ("Rip Haywire", "http://comics.com/rip_haywire"),
#(u"FoxTrot Classics", u"http://www.gocomics.com/foxtrotclassics"), # ("Ripley's Believe It or Not!", "http://comics.com/ripleys_believe_it_or_not"),
(u"FoxTrot", u"http://www.gocomics.com/foxtrot"), # ("Rose Is Rose", "http://comics.com/rose_is_rose"),
#(u"Frank & Ernest", u"http://www.gocomics.com/frankandernest"), # ("Rubes", "http://comics.com/rubes"),
#(u"Frazz", u"http://www.gocomics.com/frazz"), # ("Rudy Park", "http://comics.com/rudy_park"),
#(u"Fred Basset", u"http://www.gocomics.com/fredbasset"), # ("Scary Gary", "http://comics.com/scary_gary"),
#(u"Free Range", u"http://www.gocomics.com/freerange"), # ("Shirley and Son Classics", "http://comics.com/shirley_and_son_classics"),
#(u"Frog Applause", u"http://www.gocomics.com/frogapplause"), # ("Soup To Nutz", "http://comics.com/soup_to_nutz"),
#(u"Garfield Minus Garfield", u"http://www.gocomics.com/garfieldminusgarfield"), # ("Speed Bump", "http://comics.com/speed_bump"),
(u"Garfield", u"http://www.gocomics.com/garfield"), # ("Spot The Frog", "http://comics.com/spot_the_frog"),
#(u"Gasoline Alley", u"http://www.gocomics.com/gasolinealley"), # ("State of the Union", "http://comics.com/state_of_the_union"),
#(u"Geech Classics", u"http://www.gocomics.com/geechclassics"), # ("Strange Brew", "http://comics.com/strange_brew"),
#(u"Get Fuzzy", u"http://www.gocomics.com/getfuzzy"), # ("Tarzan Classics", "http://comics.com/tarzan_classics"),
#(u"Gil Thorp", u"http://www.gocomics.com/gilthorp"), # ("That's Life", "http://comics.com/thats_life"),
#(u"Ginger Meggs", u"http://www.gocomics.com/gingermeggs"), # ("The Barn", "http://comics.com/the_barn"),
#(u"Girls & Sports", u"http://www.gocomics.com/girlsandsports"), # ("The Born Loser", "http://comics.com/the_born_loser"),
#(u"Graffiti", u"http://www.gocomics.com/graffiti"), # ("The Buckets", "http://comics.com/the_buckets"),
#(u"Grand Avenue", u"http://www.gocomics.com/grandavenue"), # ("The Dinette Set", "http://comics.com/the_dinette_set"),
#(u"Haiku Ewe", u"http://www.gocomics.com/haikuewe"), # ("The Grizzwells", "http://comics.com/the_grizzwells"),
#(u"Heart of the City", u"http://www.gocomics.com/heartofthecity"), # ("The Humble Stumble", "http://comics.com/the_humble_stumble"),
(u"Heathcliff", u"http://www.gocomics.com/heathcliff"), # ("The Knight Life", "http://comics.com/the_knight_life"),
#(u"Herb and Jamaal", u"http://www.gocomics.com/herbandjamaal"), # ("The Meaning of Lila", "http://comics.com/the_meaning_of_lila"),
#(u"Herman", u"http://www.gocomics.com/herman"), # ("The Other Coast", "http://comics.com/the_other_coast"),
#(u"Home and Away", u"http://www.gocomics.com/homeandaway"), # ("The Sunshine Club", "http://comics.com/the_sunshine_club"),
#(u"Housebroken", u"http://www.gocomics.com/housebroken"), # ("Unstrange Phenomena", "http://comics.com/unstrange_phenomena"),
#(u"Hubert and Abby", u"http://www.gocomics.com/hubertandabby"), # ("Watch Your Head", "http://comics.com/watch_your_head"),
#(u"Imagine This", u"http://www.gocomics.com/imaginethis"), # ("Wizard of Id", "http://comics.com/wizard_of_id"),
#(u"In the Bleachers", u"http://www.gocomics.com/inthebleachers"), # ("Working Daze", "http://comics.com/working_daze"),
#(u"In the Sticks", u"http://www.gocomics.com/inthesticks"), # ("Working It Out", "http://comics.com/working_it_out"),
#(u"Ink Pen", u"http://www.gocomics.com/inkpen"), # ("Zack Hill", "http://comics.com/zack_hill"),
#(u"It's All About You", u"http://www.gocomics.com/itsallaboutyou"), # ("(Th)ink", "http://comics.com/think"),
#(u"Jane's World", u"http://www.gocomics.com/janesworld"), # "Tackling the political and social issues impacting communities of color."
#(u"Joe Vanilla", u"http://www.gocomics.com/joevanilla"), # ("Adam Zyglis", "http://comics.com/adam_zyglis"),
#(u"Jump Start", u"http://www.gocomics.com/jumpstart"), # "Known for his excellent caricatures, as well as independent and incisive imagery. "
#(u"Kit 'N' Carlyle", u"http://www.gocomics.com/kitandcarlyle"), # ("Andy Singer", "http://comics.com/andy_singer"),
#(u"La Cucaracha", u"http://www.gocomics.com/lacucaracha"), # ("Bill Day", "http://comics.com/bill_day"),
#(u"Last Kiss", u"http://www.gocomics.com/lastkiss"), # "Powerful images on sensitive issues."
#(u"Legend of Bill", u"http://www.gocomics.com/legendofbill"), # ("Bill Schorr", "http://comics.com/bill_schorr"),
#(u"Liberty Meadows", u"http://www.gocomics.com/libertymeadows"), # ("Bob Englehart", "http://comics.com/bob_englehart"),
#(u"Li'l Abner Classics", u"http://www.gocomics.com/lilabnerclassics"), # ("Brian Fairrington", "http://comics.com/brian_fairrington"),
#(u"Lio", u"http://www.gocomics.com/lio"), # ("Bruce Beattie", "http://comics.com/bruce_beattie"),
#(u"Little Dog Lost", u"http://www.gocomics.com/littledoglost"), # ("Cam Cardow", "http://comics.com/cam_cardow"),
#(u"Little Otto", u"http://www.gocomics.com/littleotto"), # ("Chip Bok", "http://comics.com/chip_bok"),
#(u"Lola", u"http://www.gocomics.com/lola"), # ("Chris Britt", "http://comics.com/chris_britt"),
#(u"Loose Parts", u"http://www.gocomics.com/looseparts"), # ("Chuck Asay", "http://comics.com/chuck_asay"),
#(u"Love Is...", u"http://www.gocomics.com/loveis"), # ("Clay Bennett", "http://comics.com/clay_bennett"),
#(u"Luann", u"http://www.gocomics.com/luann"), # ("Daryl Cagle", "http://comics.com/daryl_cagle"),
#(u"Maintaining", u"http://www.gocomics.com/maintaining"), # ("David Fitzsimmons", "http://comics.com/david_fitzsimmons"),
(u"Marmaduke", u"http://www.gocomics.com/marmaduke"), # "David Fitzsimmons is a new editorial cartoons on comics.com. He is also a staff writer and editorial cartoonist for the Arizona Daily Star. "
#(u"Meg! Classics", u"http://www.gocomics.com/megclassics"), # ("Drew Litton", "http://comics.com/drew_litton"),
#(u"Middle-Aged White Guy", u"http://www.gocomics.com/middleagedwhiteguy"), # "Drew Litton is an artist who is probably best known for his sports cartoons. He received the National Cartoonist Society Sports Cartoon Award for 1993. "
#(u"Minimum Security", u"http://www.gocomics.com/minimumsecurity"), # ("Ed Stein", "http://comics.com/ed_stein"),
#(u"Moderately Confused", u"http://www.gocomics.com/moderatelyconfused"), # "Winner of the Fischetti Award in 2006 and the Scripps Howard National Journalism Award, 1999, Ed Stein has been the editorial cartoonist for the Rocky Mountain News since 1978. "
(u"Momma", u"http://www.gocomics.com/momma"), # ("Eric Allie", "http://comics.com/eric_allie"),
#(u"Monty", u"http://www.gocomics.com/monty"), # "Eric Allie is an editorial cartoonist with the Pioneer Press and CNS News. "
#(u"Motley Classics", u"http://www.gocomics.com/motleyclassics"), # ("Gary Markstein", "http://comics.com/gary_markstein"),
(u"Mutt & Jeff", u"http://www.gocomics.com/muttandjeff"), # ("Gary McCoy", "http://comics.com/gary_mccoy"),
#(u"Mythtickle", u"http://www.gocomics.com/mythtickle"), # "Gary McCoy is known for his editorial cartoons, humor and inane ramblings. He is a 2 time nominee for Best Magazine Cartoonist of the Year by the National Cartoonists Society. He resides in Belleville, IL. "
#(u"Nancy", u"http://www.gocomics.com/nancy"), # ("Gary Varvel", "http://comics.com/gary_varvel"),
#(u"Natural Selection", u"http://www.gocomics.com/naturalselection"), # ("Henry Payne", "http://comics.com/henry_payne"),
#(u"Nest Heads", u"http://www.gocomics.com/nestheads"), # ("JD Crowe", "http://comics.com/jd_crowe"),
#(u"NEUROTICA", u"http://www.gocomics.com/neurotica"), # ("Jeff Parker", "http://comics.com/jeff_parker"),
#(u"New Adventures of Queen Victoria", u"http://www.gocomics.com/thenewadventuresofqueenvictoria"), # ("Jeff Stahler", "http://comics.com/jeff_stahler"),
#(u"Non Sequitur", u"http://www.gocomics.com/nonsequitur"), # ("Jerry Holbert", "http://comics.com/jerry_holbert"),
#(u"Off The Mark", u"http://www.gocomics.com/offthemark"), # ("John Cole", "http://comics.com/john_cole"),
#(u"On A Claire Day", u"http://www.gocomics.com/onaclaireday"), # ("John Darkow", "http://comics.com/john_darkow"),
#(u"One Big Happy Classics", u"http://www.gocomics.com/onebighappyclassics"), # "John Darkow is a contributing editorial cartoonist for the Humor Times as well as editoiral cartoonist for the Columbia Daily Tribune, Missouri"
#(u"One Big Happy", u"http://www.gocomics.com/onebighappy"), # ("John Sherffius", "http://comics.com/john_sherffius"),
#(u"Out of the Gene Pool Re-Runs", u"http://www.gocomics.com/outofthegenepool"), # ("Larry Wright", "http://comics.com/larry_wright"),
#(u"Over the Hedge", u"http://www.gocomics.com/overthehedge"), # ("Lisa Benson", "http://comics.com/lisa_benson"),
#(u"Overboard", u"http://www.gocomics.com/overboard"), # ("Marshall Ramsey", "http://comics.com/marshall_ramsey"),
#(u"PC and Pixel", u"http://www.gocomics.com/pcandpixel"), # ("Matt Bors", "http://comics.com/matt_bors"),
(u"Peanuts", u"http://www.gocomics.com/peanuts"), # ("Michael Ramirez", "http://comics.com/michael_ramirez"),
#(u"Pearls Before Swine", u"http://www.gocomics.com/pearlsbeforeswine"), # ("Mike Keefe", "http://comics.com/mike_keefe"),
#(u"Pibgorn Sketches", u"http://www.gocomics.com/pibgornsketches"), # ("Mike Luckovich", "http://comics.com/mike_luckovich"),
#(u"Pibgorn", u"http://www.gocomics.com/pibgorn"), # ("MIke Thompson", "http://comics.com/mike_thompson"),
(u"Pickles", u"http://www.gocomics.com/pickles"), # ("Monte Wolverton", "http://comics.com/monte_wolverton"),
#(u"Pinkerton", u"http://www.gocomics.com/pinkerton"), # "Unique mix of perspectives"
#(u"Pluggers", u"http://www.gocomics.com/pluggers"), # ("Mr. Fish", "http://comics.com/mr_fish"),
#(u"Pooch Cafe", u"http://www.gocomics.com/poochcafe"), # "Side effects may include swelling"
#(u"PreTeena", u"http://www.gocomics.com/preteena"), # ("Nate Beeler", "http://comics.com/nate_beeler"),
#(u"Prickly City", u"http://www.gocomics.com/pricklycity"), # "Middle America meets the Beltway."
#(u"Rabbits Against Magic", u"http://www.gocomics.com/rabbitsagainstmagic"), # ("Nick Anderson", "http://comics.com/nick_anderson"),
#(u"Raising Duncan Classics", u"http://www.gocomics.com/raisingduncanclassics"), # ("Pat Bagley", "http://comics.com/pat_bagley"),
#(u"Real Life Adventures", u"http://www.gocomics.com/reallifeadventures"), # "Unfair and Totally Unbalanced."
#(u"Reality Check", u"http://www.gocomics.com/realitycheck"), # ("Paul Szep", "http://comics.com/paul_szep"),
#(u"Red and Rover", u"http://www.gocomics.com/redandrover"), # ("RJ Matson", "http://comics.com/rj_matson"),
#(u"Red Meat", u"http://www.gocomics.com/redmeat"), # "Power cartoons from NYC and Capitol Hill"
#(u"Reynolds Unwrapped", u"http://www.gocomics.com/reynoldsunwrapped"), # ("Rob Rogers", "http://comics.com/rob_rogers"),
#(u"Rip Haywire", u"http://www.gocomics.com/riphaywire"), # "Humorous slant on current events"
#(u"Ripley's Believe It or Not!", u"http://www.gocomics.com/ripleysbelieveitornot"), # ("Robert Ariail", "http://comics.com/robert_ariail"),
#(u"Ronaldinho Gaucho", u"http://www.gocomics.com/ronaldinhogaucho"), # "Clever and unpredictable"
#(u"Rose Is Rose", u"http://www.gocomics.com/roseisrose"), # ("Scott Stantis", "http://comics.com/scott_stantis"),
#(u"Rubes", u"http://www.gocomics.com/rubes"), # ("Signe Wilkinson", "http://comics.com/signe_wilkinson"),
#(u"Rudy Park", u"http://www.gocomics.com/rudypark"), # ("Steve Benson", "http://comics.com/steve_benson"),
#(u"Scary Gary", u"http://www.gocomics.com/scarygary"), # ("Steve Breen", "http://comics.com/steve_breen"),
#(u"Shirley and Son Classics", u"http://www.gocomics.com/shirleyandsonclassics"), # ("Steve Kelley", "http://comics.com/steve_kelley"),
#(u"Shoe", u"http://www.gocomics.com/shoe"), # ("Steve Sack", "http://comics.com/steve_sack"),
#(u"Shoecabbage", u"http://www.gocomics.com/shoecabbage"),
#(u"Skin Horse", u"http://www.gocomics.com/skinhorse"),
#(u"Slowpoke", u"http://www.gocomics.com/slowpoke"),
#(u"Soup To Nutz", u"http://www.gocomics.com/souptonutz"),
#(u"Speed Bump", u"http://www.gocomics.com/speedbump"),
#(u"Spot The Frog", u"http://www.gocomics.com/spotthefrog"),
#(u"State of the Union", u"http://www.gocomics.com/stateoftheunion"),
#(u"Stone Soup", u"http://www.gocomics.com/stonesoup"),
#(u"Strange Brew", u"http://www.gocomics.com/strangebrew"),
#(u"Sylvia", u"http://www.gocomics.com/sylvia"),
#(u"Tank McNamara", u"http://www.gocomics.com/tankmcnamara"),
#(u"Tarzan Classics", u"http://www.gocomics.com/tarzanclassics"),
#(u"That's Life", u"http://www.gocomics.com/thatslife"),
#(u"The Academia Waltz", u"http://www.gocomics.com/academiawaltz"),
#(u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"),
#(u"The Barn", u"http://www.gocomics.com/thebarn"),
#(u"The Boiling Point", u"http://www.gocomics.com/theboilingpoint"),
#(u"The Boondocks", u"http://www.gocomics.com/boondocks"),
#(u"The Born Loser", u"http://www.gocomics.com/thebornloser"),
#(u"The Buckets", u"http://www.gocomics.com/thebuckets"),
#(u"The City", u"http://www.gocomics.com/thecity"),
#(u"The Dinette Set", u"http://www.gocomics.com/dinetteset"),
#(u"The Doozies", u"http://www.gocomics.com/thedoozies"),
#(u"The Duplex", u"http://www.gocomics.com/duplex"),
#(u"The Elderberries", u"http://www.gocomics.com/theelderberries"),
#(u"The Flying McCoys", u"http://www.gocomics.com/theflyingmccoys"),
#(u"The Fusco Brothers", u"http://www.gocomics.com/thefuscobrothers"),
#(u"The Grizzwells", u"http://www.gocomics.com/thegrizzwells"),
#(u"The Humble Stumble", u"http://www.gocomics.com/thehumblestumble"),
#(u"The Knight Life", u"http://www.gocomics.com/theknightlife"),
#(u"The Meaning of Lila", u"http://www.gocomics.com/meaningoflila"),
#(u"The Middletons", u"http://www.gocomics.com/themiddletons"),
#(u"The Norm", u"http://www.gocomics.com/thenorm"),
#(u"The Other Coast", u"http://www.gocomics.com/theothercoast"),
#(u"The Quigmans", u"http://www.gocomics.com/thequigmans"),
#(u"The Sunshine Club", u"http://www.gocomics.com/thesunshineclub"),
#(u"Tiny Sepuk", u"http://www.gocomics.com/tinysepuk"),
#(u"TOBY", u"http://www.gocomics.com/toby"),
#(u"Tom the Dancing Bug", u"http://www.gocomics.com/tomthedancingbug"),
#(u"Too Much Coffee Man", u"http://www.gocomics.com/toomuchcoffeeman"),
#(u"Unstrange Phenomena", u"http://www.gocomics.com/unstrangephenomena"),
#(u"W.T. Duck", u"http://www.gocomics.com/wtduck"),
#(u"Watch Your Head", u"http://www.gocomics.com/watchyourhead"),
#(u"Wee Pals", u"http://www.gocomics.com/weepals"),
#(u"Winnie the Pooh", u"http://www.gocomics.com/winniethepooh"),
#(u"Wizard of Id", u"http://www.gocomics.com/wizardofid"),
#(u"Working Daze", u"http://www.gocomics.com/workingdaze"),
#(u"Working It Out", u"http://www.gocomics.com/workingitout"),
#(u"Yenny", u"http://www.gocomics.com/yenny"),
#(u"Zack Hill", u"http://www.gocomics.com/zackhill"),
(u"Ziggy", u"http://www.gocomics.com/ziggy"),
#
######## EDITORIAL CARTOONS #####################
(u"Adam Zyglis", u"http://www.gocomics.com/adamzyglis"),
#(u"Andy Singer", u"http://www.gocomics.com/andysinger"),
#(u"Ben Sargent",u"http://www.gocomics.com/bensargent"),
#(u"Bill Day", u"http://www.gocomics.com/billday"),
#(u"Bill Schorr", u"http://www.gocomics.com/billschorr"),
#(u"Bob Englehart", u"http://www.gocomics.com/bobenglehart"),
(u"Bob Gorrell",u"http://www.gocomics.com/bobgorrell"),
#(u"Brian Fairrington", u"http://www.gocomics.com/brianfairrington"),
#(u"Bruce Beattie", u"http://www.gocomics.com/brucebeattie"),
#(u"Cam Cardow", u"http://www.gocomics.com/camcardow"),
#(u"Chan Lowe",u"http://www.gocomics.com/chanlowe"),
#(u"Chip Bok",u"http://www.gocomics.com/chipbok"),
#(u"Chris Britt",u"http://www.gocomics.com/chrisbritt"),
#(u"Chuck Asay",u"http://www.gocomics.com/chuckasay"),
#(u"Clay Bennett",u"http://www.gocomics.com/claybennett"),
#(u"Clay Jones",u"http://www.gocomics.com/clayjones"),
#(u"Dan Wasserman",u"http://www.gocomics.com/danwasserman"),
#(u"Dana Summers",u"http://www.gocomics.com/danasummers"),
#(u"Daryl Cagle", u"http://www.gocomics.com/darylcagle"),
#(u"David Fitzsimmons", u"http://www.gocomics.com/davidfitzsimmons"),
(u"Dick Locher",u"http://www.gocomics.com/dicklocher"),
#(u"Don Wright",u"http://www.gocomics.com/donwright"),
#(u"Donna Barstow",u"http://www.gocomics.com/donnabarstow"),
#(u"Drew Litton", u"http://www.gocomics.com/drewlitton"),
#(u"Drew Sheneman",u"http://www.gocomics.com/drewsheneman"),
#(u"Ed Stein", u"http://www.gocomics.com/edstein"),
#(u"Eric Allie", u"http://www.gocomics.com/ericallie"),
#(u"Gary Markstein", u"http://www.gocomics.com/garymarkstein"),
#(u"Gary McCoy", u"http://www.gocomics.com/garymccoy"),
#(u"Gary Varvel", u"http://www.gocomics.com/garyvarvel"),
#(u"Glenn McCoy",u"http://www.gocomics.com/glennmccoy"),
#(u"Henry Payne", u"http://www.gocomics.com/henrypayne"),
#(u"Jack Ohman",u"http://www.gocomics.com/jackohman"),
#(u"JD Crowe", u"http://www.gocomics.com/jdcrowe"),
#(u"Jeff Danziger",u"http://www.gocomics.com/jeffdanziger"),
#(u"Jeff Parker", u"http://www.gocomics.com/jeffparker"),
#(u"Jeff Stahler", u"http://www.gocomics.com/jeffstahler"),
#(u"Jerry Holbert", u"http://www.gocomics.com/jerryholbert"),
#(u"Jim Morin",u"http://www.gocomics.com/jimmorin"),
#(u"Joel Pett",u"http://www.gocomics.com/joelpett"),
#(u"John Cole", u"http://www.gocomics.com/johncole"),
#(u"John Darkow", u"http://www.gocomics.com/johndarkow"),
#(u"John Deering",u"http://www.gocomics.com/johndeering"),
#(u"John Sherffius", u"http://www.gocomics.com/johnsherffius"),
#(u"Ken Catalino",u"http://www.gocomics.com/kencatalino"),
#(u"Kerry Waghorn",u"http://www.gocomics.com/facesinthenews"),
#(u"Kevin Kallaugher",u"http://www.gocomics.com/kevinkallaugher"),
#(u"Lalo Alcaraz",u"http://www.gocomics.com/laloalcaraz"),
#(u"Larry Wright", u"http://www.gocomics.com/larrywright"),
#(u"Lisa Benson", u"http://www.gocomics.com/lisabenson"),
#(u"Marshall Ramsey", u"http://www.gocomics.com/marshallramsey"),
#(u"Matt Bors", u"http://www.gocomics.com/mattbors"),
#(u"Matt Davies",u"http://www.gocomics.com/mattdavies"),
#(u"Michael Ramirez", u"http://www.gocomics.com/michaelramirez"),
#(u"Mike Keefe", u"http://www.gocomics.com/mikekeefe"),
#(u"Mike Luckovich", u"http://www.gocomics.com/mikeluckovich"),
#(u"MIke Thompson", u"http://www.gocomics.com/mikethompson"),
#(u"Monte Wolverton", u"http://www.gocomics.com/montewolverton"),
#(u"Mr. Fish", u"http://www.gocomics.com/mrfish"),
#(u"Nate Beeler", u"http://www.gocomics.com/natebeeler"),
#(u"Nick Anderson", u"http://www.gocomics.com/nickanderson"),
#(u"Pat Bagley", u"http://www.gocomics.com/patbagley"),
#(u"Pat Oliphant",u"http://www.gocomics.com/patoliphant"),
#(u"Paul Conrad",u"http://www.gocomics.com/paulconrad"),
#(u"Paul Szep", u"http://www.gocomics.com/paulszep"),
#(u"RJ Matson", u"http://www.gocomics.com/rjmatson"),
#(u"Rob Rogers", u"http://www.gocomics.com/robrogers"),
#(u"Robert Ariail", u"http://www.gocomics.com/robertariail"),
#(u"Scott Stantis", u"http://www.gocomics.com/scottstantis"),
#(u"Signe Wilkinson", u"http://www.gocomics.com/signewilkinson"),
#(u"Small World",u"http://www.gocomics.com/smallworld"),
#(u"Steve Benson", u"http://www.gocomics.com/stevebenson"),
#(u"Steve Breen", u"http://www.gocomics.com/stevebreen"),
#(u"Steve Kelley", u"http://www.gocomics.com/stevekelley"),
#(u"Steve Sack", u"http://www.gocomics.com/stevesack"),
#(u"Stuart Carlson",u"http://www.gocomics.com/stuartcarlson"),
#(u"Ted Rall",u"http://www.gocomics.com/tedrall"),
#(u"(Th)ink", u"http://www.gocomics.com/think"),
#(u"Tom Toles",u"http://www.gocomics.com/tomtoles"),
(u"Tony Auth",u"http://www.gocomics.com/tonyauth"),
#(u"Views of the World",u"http://www.gocomics.com/viewsoftheworld"),
#(u"ViewsAfrica",u"http://www.gocomics.com/viewsafrica"),
#(u"ViewsAmerica",u"http://www.gocomics.com/viewsamerica"),
#(u"ViewsAsia",u"http://www.gocomics.com/viewsasia"),
#(u"ViewsBusiness",u"http://www.gocomics.com/viewsbusiness"),
#(u"ViewsEurope",u"http://www.gocomics.com/viewseurope"),
#(u"ViewsLatinAmerica",u"http://www.gocomics.com/viewslatinamerica"),
#(u"ViewsMidEast",u"http://www.gocomics.com/viewsmideast"),
(u"Walt Handelsman",u"http://www.gocomics.com/walthandelsman"),
#(u"Wayne Stayskal",u"http://www.gocomics.com/waynestayskal"),
#(u"Wit of the World",u"http://www.gocomics.com/witoftheworld"),
]:
print 'Working on: ', title
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp' soup = self.index_to_soup(url)
# print 'soup: ', soup
title = ''
current_articles = []
pages = range(1, self.num_comics_to_get+1) from datetime import datetime, timedelta
for page in pages: now = datetime.now()
page_soup = self.index_to_soup(url) dates = [(now-timedelta(days=d)).strftime('%Y/%m/%d') for d in range(self.num_comics_to_get)]
if page_soup:
try: for page in dates:
strip_title = page_soup.find(name='div', attrs={'class':'top'}).h1.a.string page_url = url + '/' + str(page)
except: print(page_url)
strip_title = 'Error - no Title found' soup = self.index_to_soup(page_url)
try: if soup:
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string strip_tag = self.tag_to_string(soup.find('a'))
if not date_title: if strip_tag:
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string print 'strip_tag: ', strip_tag
except: title = strip_tag
date_title = 'Error - no Date found' print 'title: ', title
title = strip_title + ' - ' + date_title
for i in range(2):
try:
strip_url_date = page_soup.find(name='div', attrs={'class':'top'}).h1.a['href']
break #success - this is normal exit
except:
strip_url_date = None
continue #try to get strip_url_date again
for i in range(2):
try:
prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href']
break #success - this is normal exit
except:
prev_strip_url_date = None
continue #try to get prev_strip_url_date again
if strip_url_date:
page_url = 'http://www.gocomics.com' + strip_url_date
else:
continue
if prev_strip_url_date:
prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date
else:
continue
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
url = prev_page_url
current_articles.reverse()
return current_articles
def preprocess_html(self, soup):
if soup.title:
title_string = soup.title.string.strip()
_cd = title_string.split(',',1)[1]
comic_date = ' '.join(_cd.split(' ', 4)[0:-1])
if soup.h1.span:
artist = soup.h1.span.string
soup.h1.span.string.replaceWith(comic_date + artist)
feature_item = soup.find('p',attrs={'class':'feature_item'})
if feature_item.a:
a_tag = feature_item.a
a_href = a_tag["href"]
img_tag = a_tag.img
img_tag["src"] = a_href
img_tag["width"] = self.comic_size
img_tag["height"] = None
return self.adeify_images(soup)
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
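The reworked make_links above no longer follows each strip's 'prev' link; it builds one URL per calendar day and walks backwards from today. A minimal standalone sketch of that URL construction (the Garfield base URL and the count of 3 are just example values taken from the list above):

    # Sketch of the date-based strip URLs built by the new make_links.
    # Assumes the archive lives at <comic page>/<YYYY>/<MM>/<DD>.
    from datetime import datetime, timedelta

    def strip_urls(base_url, num_comics_to_get):
        now = datetime.now()
        dates = [(now - timedelta(days=d)).strftime('%Y/%m/%d')
                 for d in range(num_comics_to_get)]
        return [base_url + '/' + d for d in dates]

    # Example: the three most recent Garfield strips, newest first.
    for url in strip_urls('http://www.gocomics.com/garfield', 3):
        print(url)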


@ -1,6 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re
from datetime import date, timedelta
class HBR(BasicNewsRecipe): class HBR(BasicNewsRecipe):
@ -11,23 +9,18 @@ class HBR(BasicNewsRecipe):
timefmt = ' [%B %Y]' timefmt = ' [%B %Y]'
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
# recipe_disabled = ('hbr.org has started requiring the use of javascript'
# ' to log into their website. This is unsupported in calibre, so'
# ' this recipe has been disabled. If you would like to see '
# ' HBR supported in calibre, contact hbr.org and ask them'
# ' to provide a javascript free login method.')
LOGIN_URL = 'https://hbr.org/login?request_url=/' LOGIN_URL = 'https://hbr.org/login?request_url=/'
LOGOUT_URL = 'https://hbr.org/logout?request_url=/' LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
INDEX = 'http://hbr.org/archive-toc/BR' INDEX = 'http://hbr.org'
keep_only_tags = [dict(name='div', id='pageContainer')] keep_only_tags = [dict(name='div', id='pageContainer')]
remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline', remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
'mailingListTout', 'partnerCenter', 'pageFooter', 'mailingListTout', 'partnerCenter', 'pageFooter',
'superNavHeadContainer', 'hbrDisqus', 'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']), 'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
dict(name='iframe')] dict(name='iframe')]
extra_css = ''' extra_css = '''
@ -57,22 +50,6 @@ class HBR(BasicNewsRecipe):
if url.endswith('/ar/1'): if url.endswith('/ar/1'):
return url[:-1]+'pr' return url[:-1]+'pr'
def hbr_get_toc(self):
# return self.index_to_soup(open('/t/toc.html').read())
today = date.today()
future = today + timedelta(days=30)
past = today - timedelta(days=30)
for x in [x.strftime('%y%m') for x in (future, today, past)]:
url = self.INDEX + x
soup = self.index_to_soup(url)
if (not soup.find(text='Issue Not Found') and not soup.find(
text="We're Sorry. There was an error processing your request")
and 'Exception: java.io.FileNotFoundException' not in
unicode(soup)):
return soup
raise Exception('Could not find current issue')
def hbr_parse_toc(self, soup): def hbr_parse_toc(self, soup):
feeds = [] feeds = []
current_section = None current_section = None
@ -105,23 +82,19 @@ class HBR(BasicNewsRecipe):
articles.append({'title':title, 'url':url, 'description':desc, articles.append({'title':title, 'url':url, 'description':desc,
'date':''}) 'date':''})
if current_section is not None and articles:
feeds.append((current_section, articles))
return feeds return feeds
def parse_index(self): def parse_index(self):
soup = self.hbr_get_toc() soup0 = self.index_to_soup('http://hbr.org/magazine')
# open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) datencover = soup0.find('ul', attrs={'id':'magazineArchiveCarousel'}).findAll('li')[-1]
#find date & cover
self.cover_url=datencover.img['src']
dates=self.tag_to_string(datencover.img['alt'])
self.timefmt = u' [%s]'%dates
soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs = {'class':'magazine_page'}).a['href'])
feeds = self.hbr_parse_toc(soup) feeds = self.hbr_parse_toc(soup)
return feeds return feeds
def get_cover_url(self):
cover_url = None
index = 'http://hbr.org/current'
soup = self.index_to_soup(index)
link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
if link_item:
cover_url = 'http://hbr.org' + link_item['src']
return cover_url
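The replacement parse_index above stops probing archive-toc URLs month by month; it reads the magazine archive carousel on hbr.org/magazine and treats the last carousel entry as the current issue, taking the cover from the img src and the issue date from its alt text. A self-contained sketch of that extraction pattern, written with BeautifulSoup 4 (the HTML snippet is a made-up stand-in, not real hbr.org markup):

    # Sketch: pick the newest issue out of an archive carousel.
    from bs4 import BeautifulSoup

    html = '''
    <ul id="magazineArchiveCarousel">
      <li><img src="/covers/march.jpg" alt="March 2013"></li>
      <li><img src="/covers/april.jpg" alt="April 2013"></li>
    </ul>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    latest = soup.find('ul', attrs={'id': 'magazineArchiveCarousel'}).findAll('li')[-1]
    cover_url = latest.img['src']    # -> /covers/april.jpg
    issue_date = latest.img['alt']   # -> April 2013
    print(cover_url, issue_date)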

Binary icon files changed in this merge: recipes/icons/forbes_pl.png and recipes/icons/slashdot.png were added, and several other recipe icons were replaced with smaller versions.


@ -41,6 +41,7 @@ class TheIndependentNew(BasicNewsRecipe):
publication_type = 'newspaper'
masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
encoding = 'utf-8'
compress_news_images = True
remove_tags =[
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']}),


@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1366025923(BasicNewsRecipe):
title = u'Lightspeed Magazine'
language = 'en'
__author__ = 'Jose Pinto'
oldest_article = 31
max_articles_per_feed = 100
auto_cleanup = True
use_embedded_content = False
feeds = [(u'Latest Stories', u'http://www.lightspeedmagazine.com/rss-2/')]


@ -36,6 +36,9 @@ from BeautifulSoup import BeautifulSoup
Changed order of regex to speedup process
Version 1.9.3 23-05-2012
Updated Cover image
Version 1.9.4 19-04-2013
Added regex filter for mailto
Updated for new layout of metro-site
''' '''
class AdvancedUserRecipe1306097511(BasicNewsRecipe): class AdvancedUserRecipe1306097511(BasicNewsRecipe):
@ -43,7 +46,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
oldest_article = 1.2 oldest_article = 1.2
max_articles_per_feed = 25 max_articles_per_feed = 25
__author__ = u'DrMerry' __author__ = u'DrMerry'
description = u'Metro Nederland' description = u'Metro Nederland v1.9.4 2013-04-19'
language = u'nl' language = u'nl'
simultaneous_downloads = 5 simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif' masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
@ -68,13 +71,17 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
#(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em') #(re.compile('(</?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
] ]
remove_tags_before= dict(id='date') remove_tags_before= dict(id='subwrapper')
remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')] remove_tags_after = dict(name='div', attrs={'class':['body-area','article-main-area']})
#name='div', attrs={'class':['subwrapper']})]
#'column-1-3','gallery-text']})]#id='share-and-byline')]
filter_regexps = [r'mailto:.*']
remove_tags = [ remove_tags = [
dict(name=['iframe','script','noscript','style']), dict(name=['iframe','script','noscript','style']),
dict(name='div', attrs={'class':['column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}), dict(name='div', attrs={'class':['aside clearfix','aside clearfix middle-col-line','comments','share-tools','article-right-column','column-4-5','column-1-5','ad-msg','col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)'),'promos','header-links','promo-2']}),
dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1']), dict(id=['article-2','googleads','column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'adadcomp-4','margin-5','sidebar',re.compile('^article-\d'),'comments','gallery-1','sharez_container','ts-container','topshares','ts-title']),
dict(name='a', attrs={'name':'comments'}),
#dict(name='div', attrs={'data-href'}),
dict(name='img', attrs={'class':'top-line','title':'volledig scherm'}),
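The v1.9.4 change noted in the comment block above adds a filter_regexps entry so that mailto links are skipped during the download; the pattern list is meant to make the recipe ignore any matching link. A quick standalone check of the pattern itself (the e-mail address is a made-up example):

    # Sketch: what the r'mailto:.*' filter added in v1.9.4 matches.
    import re

    pattern = re.compile(r'mailto:.*')
    links = ['mailto:redactie@example.nl',
             'http://www.metronieuws.nl/nieuws/some-article']
    print([u for u in links if pattern.match(u)])      # the mailto link
    print([u for u in links if not pattern.match(u)])  # the article link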


@ -6,10 +6,10 @@ import time
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provided by The Metro -UK' description = 'News from The Metro, UK'
#timefmt = ''
__author__ = 'fleclerc & Dave Asbury' __author__ = 'Dave Asbury'
#last update 20/1/13 #last update 4/4/13
#cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
@ -22,7 +22,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
language = 'en_GB'
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
compress_news_images = True
def parse_index(self):
articles = {}
key = None


@ -1,64 +1,44 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
newyorker.com
'''
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
class NewYorker(BasicNewsRecipe): class NewYorker(BasicNewsRecipe):
title = 'The New Yorker'
__author__ = 'Darko Miletic'
description = 'The best of US journalism' title = u'New Yorker Magazine'
oldest_article = 15 newyorker_prefix = 'http://m.newyorker.com'
description = u'Content from the New Yorker website'
fp_tag = 'CAN_TC'
masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
compress_news_images = True
compress_news_images_auto_size = 8
scale_news_images_to_device = False
scale_news_images = (768, 1024)
url_list = []
language = 'en' language = 'en'
max_articles_per_feed = 100 __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False timefmt = ' [%b %d]'
publisher = 'Conde Nast Publications' encoding = 'utf-8'
category = 'news, politics, USA' extra_css = '''
encoding = 'cp1252' .byline { font-size:xx-small; font-weight: bold;}
publication_type = 'magazine' h3 { margin-bottom: 6px; }
masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif' .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
extra_css = """ '''
body {font-family: "Times New Roman",Times,serif} keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
.articleauthor{color: #9F9F9F;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.rubric,.dd,h6#credit{color: #CD0021;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
.dd,h6#credit{color: gray}
.c{display: block}
.caption,h2#articleintro{font-style: italic}
.caption{font-size: small}
"""
conversion_options = { remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
remove_tags = [
dict(name=['meta','iframe','base','link','embed','object'])
,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
,dict(attrs={'id':['show-header','show-footer'] })
]
remove_tags_after = dict(attrs={'class':'entry-content'})
remove_attributes = ['lang']
feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
def print_version(self, url):
return url + '?printable=true&currentPage=all'
def image_url_processor(self, baseurl, url):
return url.strip()
def get_cover_url(self): def get_cover_url(self):
cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg" cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip() cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
return cover_url return cover_url
def preprocess_html(self, soup): def fixChars(self,string):
for item in soup.findAll(style=True): # Replace lsquo (\x91)
del item['style'] fixed = re.sub("\x91","",string)
auth = soup.find(attrs={'id':'articleauthor'}) # Replace rsquo (\x92)
if auth: fixed = re.sub("\x92","",fixed)
alink = auth.find('a') # Replace ldquo (\x93)
if alink and alink.string is not None: fixed = re.sub("\x93","“",fixed)
txt = alink.string # Replace rdquo (\x94)
alink.replaceWith(txt) fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
fixed = re.sub("&#x2019;","",fixed)
return fixed
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&'
massaged = re.sub("&","&", massaged)
return self.fixChars(massaged)
else:
return description
def populate_article_metadata(self, article, soup, first):
if first:
picdiv = soup.find('body').find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
xtitle = article.text_summary.strip()
if len(xtitle) == 0:
desc = soup.find('meta',attrs={'property':'og:description'})
if desc is not None:
article.summary = article.text_summary = desc['content']
shortparagraph = ""
## try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
newpara = shortparagraph + refparagraph
article.summary = article.text_summary = newpara.strip()
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
else:
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
## except:
## self.log("Error creating article descriptions")
## return
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self,soup):
dateline = soup.find('div','published')
byline = soup.find('div','byline')
title = soup.find('h1','entry-title')
if title is None:
return self.strip_anchors(soup)
if byline is None:
title.append(dateline)
return self.strip_anchors(soup)
byline.append(dateline)
return self.strip_anchors(soup)
def load_global_nav(self,soup):
seclist = []
ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
if ul is not None:
for li in ul.findAll('li'):
if li.a is not None:
securl = li.a['href']
if securl != '/' and securl != '/magazine' and securl.startswith('/'):
seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
return seclist
def exclude_url(self,url):
if url in self.url_list:
return True
if not url.endswith('html'):
return True
if 'goings-on-about-town-app' in url:
return True
if 'something-to-be-thankful-for' in url:
return True
if '/shouts/' in url:
return True
if 'out-loud' in url:
return True
if '/rss/' in url:
return True
if '/video-' in url:
return True
self.url_list.append(url)
return False
def load_index_page(self,soup):
article_list = []
for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
h2 = div.h2
if h2 is not None:
a = h2.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h2.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author.replace('by ','')
byline.extract()
else:
author = ''
if h2.br is not None:
h2.br.replaceWith(' ')
title = self.tag_to_string(h2)
desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
if desc is not None:
description = self.tag_to_string(desc)
else:
description = ''
article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
ul = div.find('ul','feature-blurb-links')
if ul is not None:
for li in ul.findAll('li'):
a = li.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if a.br is not None:
a.br.replaceWith(' ')
title = '>>'+self.tag_to_string(a)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
for h3 in soup.findAll('h3','header'):
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
byline = h3.span
if byline is not None:
author = self.tag_to_string(byline)
if author.startswith('by '):
author = author.replace('by ','')
byline.extract()
else:
author = ''
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3).strip()
article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
return article_list
def load_global_section(self,securl):
article_list = []
try:
soup = self.index_to_soup(securl)
except:
return article_list
if '/blogs/' not in securl:
return self.load_index_page(soup)
for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
h3 = div.h3
if h3 is not None:
a = h3.a
if a is not None:
url = a['href']
if not self.exclude_url(url):
if url.startswith('/'):
url = self.newyorker_prefix+url
if h3.br is not None:
h3.br.replaceWith(' ')
title = self.tag_to_string(h3)
article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
return article_list
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if True: #self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def parse_index(self):
ans = []
try:
soup = self.index_to_soup(self.newyorker_prefix)
except:
return ans
seclist = self.load_global_nav(soup)
ans.append(('Front Page',self.load_index_page(soup)))
for (sectitle,securl) in seclist:
ans.append((sectitle,self.load_global_section(securl)))
return self.filter_ans(ans)
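A good deal of the new recipe above is bookkeeping: exclude_url both rejects known non-article URLs and de-duplicates by recording every accepted URL in url_list. A condensed standalone sketch of that pattern (the sample URLs are placeholders, not real m.newyorker.com paths):

    # Sketch of the accept/skip/de-duplicate pattern used by exclude_url.
    seen = []
    SKIP_FRAGMENTS = ('/rss/', '/video-', '/shouts/', 'out-loud')

    def exclude_url(url):
        if url in seen:               # already queued once
            return True
        if not url.endswith('html'):  # indexes, feeds, apps, ...
            return True
        if any(frag in url for frag in SKIP_FRAGMENTS):
            return True
        seen.append(url)              # first sighting: keep it
        return False

    urls = ['http://m.newyorker.com/a/story-one.html',
            'http://m.newyorker.com/a/story-one.html',
            'http://m.newyorker.com/rss/feed.html']
    print([u for u in urls if not exclude_url(u)])  # story-one appears once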


@ -12,6 +12,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
max_articles_per_feed = 20
#auto_cleanup = True
language = 'en_GB'
compress_news_images = True
def get_cover_url(self):
soup = self.index_to_soup('http://www.nme.com/component/subscribe')


@ -11,7 +11,8 @@ class PsychologyToday(BasicNewsRecipe):
language = 'en' language = 'en'
category = 'news' category = 'news'
encoding = 'UTF-8' encoding = 'UTF-8'
keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})] auto_cleanup = True
#keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
no_javascript = True no_javascript = True
no_stylesheets = True no_stylesheets = True
@ -31,50 +32,32 @@ class PsychologyToday(BasicNewsRecipe):
self.timefmt = u' [%s]'%date self.timefmt = u' [%s]'%date
articles = [] articles = []
for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}): for post in div.findAll('div', attrs={'class':'collections-node-feature collection-node-even'}):
title = self.tag_to_string(post.find('h2')) title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'}) author_item=post.find('div', attrs={'class':'collection-node-byline'})
author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
title = title + u' (%s)'%author title = title + u' (%s)'%author
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) url= 'http://www.psychologytoday.com'+post.find('a', href=True)['href']
print_page=article_page.find('li', attrs={'class':'print_html first'}) #print_page=article_page.find('li', attrs={'class':'print_html first'})
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] #url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
for post in div.findAll('div', attrs={'class':'collections-node-feature collection-node-odd'}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
title = title + u' (%s)'%author
url= 'http://www.psychologytoday.com'+post.find('a', href=True)['href']
#print_page=article_page.find('li', attrs={'class':'print_html first'})
#url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
self.log('Found article:', title) self.log('Found article:', title)
self.log('\t', url) self.log('\t', url)
self.log('\t', desc) self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc}) articles.append({'title':title, 'url':url, 'date':'','description':desc})
for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
print_page=article_page.find('li', attrs={'class':'print_html first'})
description = post.find('div', attrs={'class':'collection-node-description'})
author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip())
desc = self.tag_to_string(description).strip()
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
title = title + u' (%s)'%author
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}):
title = self.tag_to_string(post.find('h2'))
author_item=post.find('div', attrs={'class':'collection-node-byline'})
author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip())
title = title + u' (%s)'%author
article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
print_page=article_page.find('li', attrs={'class':'print_html first'})
if print_page is not None:
url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'','description':desc})
return [('Current Issue', articles)]
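The simplified loop above takes the article URL straight from the issue listing instead of fetching each article page to hunt for a print link. Whichever way the URL is obtained, parse_index hands back the same structure, sketched here with placeholder values:

    # Sketch of the return value parse_index builds above
    # (title, author, URL and description are illustrative placeholders).
    articles = []
    for title, author, url, desc in [
            ('Sample feature', 'A. Author',
             'http://www.psychologytoday.com/sample-feature', 'Teaser text'),
    ]:
        articles.append({'title': '%s (%s)' % (title, author),
                         'url': url,
                         'date': '',
                         'description': desc})

    index = [('Current Issue', articles)]  # list of (section title, articles)
    print(index)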


@ -7,7 +7,6 @@ sfgate.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re
class SanFranciscoChronicle(BasicNewsRecipe): class SanFranciscoChronicle(BasicNewsRecipe):
title = u'San Francisco Chronicle' title = u'San Francisco Chronicle'
@ -19,16 +18,7 @@ class SanFranciscoChronicle(BasicNewsRecipe):
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
auto_cleanup = True
remove_tags_before = {'id':'printheader'}
remove_tags = [
dict(name='div',attrs={'id':'printheader'})
,dict(name='a', attrs={'href':re.compile('http://ads\.pheedo\.com.*')})
,dict(name='div',attrs={'id':'footer'})
]
extra_css = ''' extra_css = '''
h1{font-family :Arial,Helvetica,sans-serif; font-size:large;} h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
@ -43,33 +33,13 @@ class SanFranciscoChronicle(BasicNewsRecipe):
''' '''
feeds = [ feeds = [
(u'Top News Stories', u'http://www.sfgate.com/rss/feeds/news.xml') (u'Bay Area News', u'http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php'),
(u'City Insider', u'http://www.sfgate.com/default/feed/City-Insider-Blog-573.php'),
(u'Crime Scene', u'http://www.sfgate.com/rss/feed/Crime-Scene-Blog-599.php'),
(u'Education News', u'http://www.sfgate.com/education/feed/Education-News-from-SFGate-430.php'),
(u'National News', u'http://www.sfgate.com/rss/feed/National-News-RSS-Feed-435.php'),
(u'Weird News', u'http://www.sfgate.com/weird/feed/Weird-News-RSS-Feed-433.php'),
(u'World News', u'http://www.sfgate.com/rss/feed/World-News-From-SFGate-432.php'),
] ]
def print_version(self,url):
url= url +"&type=printable"
return url
def get_article_url(self, article):
print str(article['title_detail']['value'])
url = article.get('guid',None)
url = "http://www.sfgate.com/cgi-bin/article.cgi?f="+url
if "Presented By:" in str(article['title_detail']['value']):
url = ''
return url


@ -50,6 +50,10 @@ class ScienceNewsIssue(BasicNewsRecipe):
dict(name='ul', attrs={'id':'toc'}) dict(name='ul', attrs={'id':'toc'})
] ]
remove_tags= [ dict(name='a', attrs={'class':'enlarge print-no'}),
dict(name='a', attrs={'rel':'shadowbox'})
]
feeds = [(u"Science News Current Issues", u'http://www.sciencenews.org/view/feed/type/edition/name/issues.rss')] feeds = [(u"Science News Current Issues", u'http://www.sciencenews.org/view/feed/type/edition/name/issues.rss')]
match_regexps = [ match_regexps = [
@ -57,6 +61,12 @@ class ScienceNewsIssue(BasicNewsRecipe):
r'www.sciencenews.org/view/generic/id' r'www.sciencenews.org/view/generic/id'
] ]
def image_url_processor(self, baseurl, url):
x = url.split('/')
if x[4] == u'scale':
url = u'http://www.sciencenews.org/view/download/id/' + x[6] + u'/name/' + x[-1]
return url
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
index = 'http://www.sciencenews.org/view/home' index = 'http://www.sciencenews.org/view/home'
@ -64,7 +74,6 @@ class ScienceNewsIssue(BasicNewsRecipe):
link_item = soup.find(name = 'img',alt = "issue") link_item = soup.find(name = 'img',alt = "issue")
if link_item: if link_item:
cover_url = 'http://www.sciencenews.org' + link_item['src'] + '.jpg' cover_url = 'http://www.sciencenews.org' + link_item['src'] + '.jpg'
return cover_url return cover_url
def preprocess_html(self, soup): def preprocess_html(self, soup):


@ -0,0 +1,70 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image
class sportowefakty(BasicNewsRecipe):
title = u'SportoweFakty'
__author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
language = 'pl'
description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
oldest_article = 1
masthead_url='http://www.sportowefakty.pl/images/logo.png'
max_articles_per_feed = 100
simultaneous_downloads = 5
use_embedded_content=False
remove_javascript=True
no_stylesheets=True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
remove_tags =[]
remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
remove_tags.append(dict(attrs = {'target' : '_blank'}))
feeds = [
(u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
(u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
(u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
(u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
(u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
(u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
(u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
(u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
]
def get_article_url(self, article):
link = article.get('link', None)
if 'utm_source' in link:
return link.split('?utm')[0]
else:
return link
def print_version(self, url):
print_url = url + '/drukuj'
return print_url
def preprocess_html(self, soup):
head = soup.find('h1')
if 'Fotorelacja' in self.tag_to_string(head):
return None
else:
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
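postprocess_html above runs every downloaded image through calibre's magick wrapper and forces it to grayscale before saving it back. For experimenting outside calibre, a comparable effect can be previewed with Pillow; this is a substitute library used only for illustration, not what the recipe itself calls:

    # Sketch: grayscale conversion comparable to the magick call above,
    # done with Pillow for quick testing outside calibre.
    from PIL import Image

    def to_grayscale(path):
        Image.open(path).convert('L').save(path)  # 'L' = 8-bit grayscale

    # to_grayscale('strip.jpg')  # hypothetical local file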


@ -1,8 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini' __author__ = 'Lorenzo Vigentini and Tom Surace'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>' __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>, 2013 Tom Surace <tekhedd@byteheaven.net>'
description = 'the Escapist Magazine - v1.02 (09, January 2010)' description = 'The Escapist Magazine - v1.3 (2013, April 2013)'
#
# Based on 'the Escapist Magazine - v1.02 (09, January 2010)'
''' '''
http://www.escapistmagazine.com/ http://www.escapistmagazine.com/
@ -11,12 +14,11 @@ http://www.escapistmagazine.com/
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class al(BasicNewsRecipe): class al(BasicNewsRecipe):
author = 'Lorenzo Vigentini' author = 'Lorenzo Vigentini and Tom Surace'
description = 'The Escapist Magazine' description = 'The Escapist Magazine'
cover_url = 'http://cdn.themis-media.com/themes/escapistmagazine/default/images/logo.png' cover_url = 'http://cdn.themis-media.com/themes/escapistmagazine/default/images/logo.png'
title = u'The Escapist Magazine' title = u'The Escapist Magazine'
publisher = 'Themis media' publisher = 'Themis Media'
category = 'Video games news, lifestyle, gaming culture' category = 'Video games news, lifestyle, gaming culture'
language = 'en' language = 'en'
@ -36,18 +38,19 @@ class al(BasicNewsRecipe):
] ]
def print_version(self,url): def print_version(self,url):
# Expect article url in the format:
# http://www.escapistmagazine.com/news/view/123198-article-name?utm_source=rss&utm_medium=rss&utm_campaign=news
#
baseURL='http://www.escapistmagazine.com' baseURL='http://www.escapistmagazine.com'
segments = url.split('/') segments = url.split('/')
#basename = '/'.join(segments[:3]) + '/'
subPath= '/'+ segments[3] + '/' subPath= '/'+ segments[3] + '/'
articleURL=(segments[len(segments)-1])[0:5]
if articleURL[4] =='-': # The article number is the "number" that starts the name
articleURL=articleURL[:4] articleNumber = segments[len(segments)-1]; # the "article name"
articleNumber = articleNumber.split('-')[0]; # keep part before hyphen
printVerString='print/'+ articleURL fullUrl = baseURL + subPath + 'print/' + articleNumber
s= baseURL + subPath + printVerString return fullUrl
return s
keep_only_tags = [
dict(name='div', attrs={'id':'article'})
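The rewritten print_version above is easiest to check by tracing it on the URL shape documented in its comment; a standalone rerun of the same string handling (the example URL comes from that comment):

    # Sketch: trace of the new print_version URL handling.
    url = ('http://www.escapistmagazine.com/news/view/'
           '123198-article-name?utm_source=rss')

    baseURL = 'http://www.escapistmagazine.com'
    segments = url.split('/')
    subPath = '/' + segments[3] + '/'           # '/news/'
    articleNumber = segments[-1].split('-')[0]  # '123198'
    print(baseURL + subPath + 'print/' + articleNumber)
    # -> http://www.escapistmagazine.com/news/print/123198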


@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1365777047(BasicNewsRecipe):
title = u'The Feature'
__author__ = 'Jose Pinto'
language = 'en'
oldest_article = 30
max_articles_per_feed = 100
auto_cleanup = True
use_embedded_content = False
feeds = [(u'Latest', u'http://thefeature.net/rss/links')]


@ -20,7 +20,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
ignore_duplicate_articles = {'title','url'}
compress_news_images = True
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}


@ -1,5 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
''' '''
theonion.com theonion.com
@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheOnion(BasicNewsRecipe): class TheOnion(BasicNewsRecipe):
title = 'The Onion' title = 'The Onion'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = "America's finest news source" description = "The Onion, America's Finest News Source, is an award-winning publication covering world, national, and * local issues. It is updated daily online and distributed weekly in select American cities."
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
publisher = 'Onion, Inc.' publisher = 'Onion, Inc.'
@ -20,7 +20,8 @@ class TheOnion(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
publication_type = 'newsportal' publication_type = 'newsportal'
masthead_url = 'http://o.onionstatic.com/img/headers/onion_190.png' needs_subscription = 'optional'
masthead_url = 'http://www.theonion.com/static/onion/img/logo_1x.png'
extra_css = """ extra_css = """
body{font-family: Helvetica,Arial,sans-serif} body{font-family: Helvetica,Arial,sans-serif}
.section_title{color: gray; text-transform: uppercase} .section_title{color: gray; text-transform: uppercase}
@ -37,17 +38,11 @@ class TheOnion(BasicNewsRecipe):
, 'language' : language , 'language' : language
} }
keep_only_tags = [ keep_only_tags = [dict(attrs={'class':'full-article'})]
dict(name='h2', attrs={'class':['section_title','title']})
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
,dict(attrs={'id':['entries']})
]
remove_attributes = ['lang','rel'] remove_attributes = ['lang','rel']
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
remove_tags = [ remove_tags = [
dict(name=['object','link','iframe','base','meta']) dict(name=['object','link','iframe','base','meta'])
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']}) ,dict(attrs={'class':lambda x: x and 'share-tools' in x.split()})
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
] ]
@ -56,6 +51,17 @@ class TheOnion(BasicNewsRecipe):
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' ) ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
] ]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.open('http://www.theonion.com/')
if self.username is not None and self.password is not None:
br.open('https://ui.ppjol.com/login/onion/u/j_spring_security_check')
br.select_form(name='f')
br['j_username'] = self.username
br['j_password'] = self.password
br.submit()
return br
def get_article_url(self, article): def get_article_url(self, article):
artl = BasicNewsRecipe.get_article_url(self, article) artl = BasicNewsRecipe.get_article_url(self, article)
if artl.startswith('http://www.theonion.com/audio/'): if artl.startswith('http://www.theonion.com/audio/'):
@ -79,4 +85,8 @@ class TheOnion(BasicNewsRecipe):
else: else:
str = self.tag_to_string(item) str = self.tag_to_string(item)
item.replaceWith(str) item.replaceWith(str)
for item in soup.findAll('img'):
if item.has_key('data-src'):
item['src'] = item['data-src']
return soup return soup
@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
''' '''
tomshardware.com/us tomshardware.com/us
''' '''
@ -16,21 +14,19 @@ class Tomshardware(BasicNewsRecipe):
publisher = "Tom's Hardware" publisher = "Tom's Hardware"
category = 'news, IT, hardware, USA' category = 'news, IT, hardware, USA'
no_stylesheets = True no_stylesheets = True
needs_subscription = True needs_subscription = 'optional'
language = 'en' language = 'en'
INDEX = 'http://www.tomshardware.com' INDEX = 'http://www.tomshardware.com'
LOGIN = INDEX + '/membres/' LOGIN = INDEX + '/membres/'
remove_javascript = True remove_javascript = True
use_embedded_content= False use_embedded_content= False
html2lrf_options = [ conversion_options = {
'--comment', description 'comment' : description
, '--category', category , 'tags' : category
, '--publisher', publisher , 'publisher' : publisher
] , 'language' : language
}
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
@ -50,8 +46,8 @@ class Tomshardware(BasicNewsRecipe):
] ]
feeds = [ feeds = [
(u'Latest Articles', u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-2.xml' ) (u'Reviews', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-2.xml')
,(u'Latest News' , u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-1.xml') ,(u'News' , u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-1.xml')
] ]
def print_version(self, url): def print_version(self, url):
@ -1,5 +1,6 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class TVXS(BasicNewsRecipe): class TVXS(BasicNewsRecipe):
@ -8,19 +9,30 @@ class TVXS(BasicNewsRecipe):
description = 'News from Greece' description = 'News from Greece'
max_articles_per_feed = 100 max_articles_per_feed = 100
oldest_article = 3 oldest_article = 3
simultaneous_downloads = 1
publisher = 'TVXS' publisher = 'TVXS'
category = 'news, GR' category = 'news, sport, greece'
language = 'el' language = 'el'
encoding = None encoding = None
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
#conversion_options = { 'linearize_tables': True} conversion_options = {'smarten_punctuation': True}
no_stylesheets = True no_stylesheets = True
publication_type = 'newspaper'
remove_tags_before = dict(name='h1',attrs={'class':'print-title'}) remove_tags_before = dict(name='h1',attrs={'class':'print-title'})
remove_tags_after = dict(name='div',attrs={'class':'field field-type-relevant-content field-field-relevant-articles'}) remove_tags_after = dict(name='div',attrs={'class':'field field-type-relevant-content field-field-relevant-articles'})
remove_attributes = ['width', 'src', 'header', 'footer'] remove_tags = [dict(name='div',attrs={'class':'field field-type-relevant-content field-field-relevant-articles'}),
dict(name='div',attrs={'class':'field field-type-filefield field-field-image-gallery'}),
dict(name='div',attrs={'class':'filefield-file'})]
remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
table { width: 100%; } \
td img { display: block; margin: 5px auto; } \
ul { padding-top: 10px; } \
ol { padding-top: 10px; } \
li { padding-top: 5px; padding-bottom: 5px; } \
h1 { text-align: center; font-size: 125%; font-weight: bold; } \
h2, h3, h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''), (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
feeds = [(u'Ελλάδα', 'http://tvxs.gr/feeds/2/feed.xml'), feeds = [(u'Ελλάδα', 'http://tvxs.gr/feeds/2/feed.xml'),
(u'Κόσμος', 'http://tvxs.gr/feeds/5/feed.xml'), (u'Κόσμος', 'http://tvxs.gr/feeds/5/feed.xml'),
@ -35,17 +47,10 @@ class TVXS(BasicNewsRecipe):
(u'Ιστορία', 'http://tvxs.gr/feeds/1573/feed.xml'), (u'Ιστορία', 'http://tvxs.gr/feeds/1573/feed.xml'),
(u'Χιούμορ', 'http://tvxs.gr/feeds/692/feed.xml')] (u'Χιούμορ', 'http://tvxs.gr/feeds/692/feed.xml')]
def print_version(self, url): def print_version(self, url):
import urllib2, urlparse, StringIO, gzip br = self.get_browser()
response = br.open(url)
fp = urllib2.urlopen(url) data = response.read()
data = fp.read()
if fp.info()['content-encoding'] == 'gzip':
gzip_data = StringIO.StringIO(data)
gzipper = gzip.GzipFile(fileobj=gzip_data)
data = gzipper.read()
fp.close()
pos_1 = data.find('<a href="/print/') pos_1 = data.find('<a href="/print/')
if pos_1 == -1: if pos_1 == -1:
@ -57,5 +62,5 @@ class TVXS(BasicNewsRecipe):
pos_1 += len('<a href="') pos_1 += len('<a href="')
new_url = data[pos_1:pos_2] new_url = data[pos_1:pos_2]
print_url = urlparse.urljoin(url, new_url) print_url = "http://tvxs.gr" + new_url
return print_url return print_url
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class UniverseToday(BasicNewsRecipe):
title = u'Universe Today'
language = 'en'
description = u'Space and astronomy news.'
__author__ = 'seird'
publisher = u'universetoday.com'
category = 'science, astronomy, news, rss'
oldest_article = 7
max_articles_per_feed = 40
auto_cleanup = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
class TimesColonist(BasicNewsRecipe): class TimesColonist(BasicNewsRecipe):
# Customization -- remove sections you don't want.
# If your e-reader is an e-ink Kindle and your output profile is
# set properly this recipe will not include images because the
# resulting file is too large. If you have one of these and want
# images you can set kindle_omit_images = False
# and remove sections (typically the e-ink Kindles will
# work with about a dozen of these, but your mileage may vary).
kindle_omit_images = True
section_list = [
('','Web Front Page'),
('news/','News Headlines'),
('news/b-c/','BC News'),
('news/national/','National News'),
('news/world/','World News'),
('opinion/','Opinion'),
('opinion/letters/','Letters'),
('business/','Business'),
('business/money/','Money'),
('business/technology/','Technology'),
('business/working/','Working'),
('sports/','Sports'),
('sports/hockey/','Hockey'),
('sports/football/','Football'),
('sports/basketball/','Basketball'),
('sports/golf/','Golf'),
('entertainment/','entertainment'),
('entertainment/go/','Go!'),
('entertainment/music/','Music'),
('entertainment/books/','Books'),
('entertainment/Movies/','Movies'),
('entertainment/television/','Television'),
('life/','Life'),
('life/health/','Health'),
('life/travel/','Travel'),
('life/driving/','Driving'),
('life/homes/','Homes'),
('life/food-drink/','Food & Drink')
]
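For anyone customizing a private copy of this recipe, a minimal sketch of the two knobs described in the comment above; the trimmed section list is illustrative only:
    # In your own copy: keep images on an e-ink Kindle and fetch only a few sections.
    kindle_omit_images = False
    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
        ('sports/', 'Sports'),
    ]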
title = u'Victoria Times Colonist' title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com' url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC' description = u'News from Victoria, BC'
fp_tag = 'CAN_TC' fp_tag = 'CAN_TC'
masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
url_list = [] url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
.caption { font-size: xx-small; font-style: italic; font-weight: normal; } .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
''' '''
keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})] keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
remove_tags = [{'class':'comments'},
def __init__(self, options, log, progress_reporter):
self.remove_tags = [{'class':'comments'},
{'id':'photocredit'}, {'id':'photocredit'},
dict(name='div', attrs={'class':re.compile('top.controls')}), dict(name='div', attrs={'class':re.compile('top.controls')}),
dict(name='div', attrs={'class':re.compile('^comments')}),
dict(name='div', attrs={'class':re.compile('social')}), dict(name='div', attrs={'class':re.compile('social')}),
dict(name='div', attrs={'class':re.compile('tools')}), dict(name='div', attrs={'class':re.compile('tools')}),
dict(name='div', attrs={'class':re.compile('bottom.tools')}), dict(name='div', attrs={'class':re.compile('bottom.tools')}),
dict(name='div', attrs={'class':re.compile('window')}), dict(name='div', attrs={'class':re.compile('window')}),
dict(name='div', attrs={'class':re.compile('related.news.element')})] dict(name='div', attrs={'class':re.compile('related.news.element')})]
print("PROFILE NAME = "+options.output_profile.short_name)
if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
BasicNewsRecipe.__init__(self, options, log, progress_reporter)
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date from datetime import timedelta, date
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
def preprocess_html(self,soup): def preprocess_html(self,soup):
byline = soup.find('p',attrs={'class':re.compile('ancillary')}) byline = soup.find('p',attrs={'class':re.compile('ancillary')})
if byline is not None: if byline is not None:
byline.find('a')
authstr = self.tag_to_string(byline,False) authstr = self.tag_to_string(byline,False)
authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE) authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE) authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
atag = htag.a atag = htag.a
if atag is not None: if atag is not None:
url = atag['href'] url = atag['href']
#print("Checking "+url) url = url.strip()
if atag['href'].startswith('/'): # print("Checking >>"+url+'<<\n\r')
url = self.url_prefix+atag['href'] if url.startswith('/'):
url = self.url_prefix+url
if url in self.url_list: if url in self.url_list:
return return
self.url_list.append(url) self.url_list.append(url)
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
if dtag is not None: if dtag is not None:
description = self.tag_to_string(dtag,False) description = self.tag_to_string(dtag,False)
article_list.append(dict(title=title,url=url,date='',description=description,author='',content='')) article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
#print(sectitle+title+": description = "+description+" URL="+url) print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
def add_section_index(self,ans,securl,sectitle): def add_section_index(self,ans,securl,sectitle):
print("Add section url="+self.url_prefix+'/'+securl) print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
try: try:
soup = self.index_to_soup(self.url_prefix+'/'+securl) soup = self.index_to_soup(self.url_prefix+'/'+securl)
except: except:
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
ans = [] ans = []
ans = self.add_section_index(ans,'','Web Front Page') for (url,title) in self.section_list:
ans = self.add_section_index(ans,'news/','News Headlines') ans = self.add_section_index(ans,url,title)
ans = self.add_section_index(ans,'news/b-c/','BC News')
ans = self.add_section_index(ans,'news/national/','Natioanl News')
ans = self.add_section_index(ans,'news/world/','World News')
ans = self.add_section_index(ans,'opinion/','Opinion')
ans = self.add_section_index(ans,'opinion/letters/','Letters')
ans = self.add_section_index(ans,'business/','Business')
ans = self.add_section_index(ans,'business/money/','Money')
ans = self.add_section_index(ans,'business/technology/','Technology')
ans = self.add_section_index(ans,'business/working/','Working')
ans = self.add_section_index(ans,'sports/','Sports')
ans = self.add_section_index(ans,'sports/hockey/','Hockey')
ans = self.add_section_index(ans,'sports/football/','Football')
ans = self.add_section_index(ans,'sports/basketball/','Basketball')
ans = self.add_section_index(ans,'sports/golf/','Golf')
ans = self.add_section_index(ans,'entertainment/','entertainment')
ans = self.add_section_index(ans,'entertainment/go/','Go!')
ans = self.add_section_index(ans,'entertainment/music/','Music')
ans = self.add_section_index(ans,'entertainment/books/','Books')
ans = self.add_section_index(ans,'entertainment/Movies/','movies')
ans = self.add_section_index(ans,'entertainment/television/','Television')
ans = self.add_section_index(ans,'life/','Life')
ans = self.add_section_index(ans,'life/health/','Health')
ans = self.add_section_index(ans,'life/travel/','Travel')
ans = self.add_section_index(ans,'life/driving/','Driving')
ans = self.add_section_index(ans,'life/homes/','Homes')
ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
return ans return ans
@ -0,0 +1,27 @@
from calibre.web.feeds.news import BasicNewsRecipe
class HindustanTimes(BasicNewsRecipe):
title = u'Voice of America'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 15 #days
max_articles_per_feed = 25
#encoding = 'cp1252'
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('All Zones',
'http://learningenglish.voanews.com/rss/?count=20'),
('World',
'http://learningenglish.voanews.com/rss/?count=20&zoneid=957'),
('USA',
'http://learningenglish.voanews.com/rss/?count=20&zoneid=958'),
('Health',
'http://learningenglish.voanews.com/rss/?count=20&zoneid=955'),
]
@ -1,144 +0,0 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
class GazetaWyborczaDuzyForma(BasicNewsRecipe):
cover_url = 'http://bi.gazeta.pl/im/8/5415/m5415058.gif'
title = u"Gazeta Wyborcza Duzy Format"
__author__ = 'ravcio - rlelusz[at]gmail.com'
description = u"Articles from Gazeta's website"
language = 'pl'
max_articles_per_feed = 50 #you can increade it event up to maybe 600, should still work
recursions = 0
encoding = 'iso-8859-2'
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
keep_only_tags = [
dict(name='div', attrs={'id':['k1']})
]
remove_tags = [
dict(name='div', attrs={'class':['zdjM', 'rel_video', 'zdjP', 'rel_box', 'index mod_zi_dolStrony']})
,dict(name='div', attrs={'id':['source', 'banP4', 'article_toolbar', 'rel', 'inContext_disabled']})
,dict(name='ul', attrs={'id':['articleToolbar']})
,dict(name='img', attrs={'class':['brand']})
,dict(name='h5', attrs={'class':['author']})
,dict(name='h6', attrs={'class':['date']})
,dict(name='p', attrs={'class':['txt_upl']})
]
remove_tags_after = [
dict(name='div', attrs={'id':['Str']}) #nawigator numerow linii
]
def load_article_links(self, url, count):
print '--- load_article_links', url, count
#page with link to articles
soup = self.index_to_soup(url)
#table with articles
list = soup.find('div', attrs={'class':'GWdalt'})
#single articles (link, title, ...)
links = list.findAll('div', attrs={'class':['GWdaltE']})
if len(links) < count:
#load links to more articles...
#remove new link
pages_nav = list.find('div', attrs={'class':'pages'})
next = pages_nav.find('a', attrs={'class':'next'})
if next:
print 'next=', next['href']
url = 'http://wyborcza.pl' + next['href']
#e.g. url = 'http://wyborcza.pl/0,75480.html?str=2'
older_links = self.load_article_links(url, count - len(links))
links.extend(older_links)
return links
#produce list of articles to download
def parse_index(self):
print '--- parse_index'
max_articles = 8000
links = self.load_article_links('http://wyborcza.pl/0,75480.html', max_articles)
ans = []
key = None
articles = {}
key = 'Uncategorized'
articles[key] = []
for div_art in links:
div_date = div_art.find('div', attrs={'class':'kL'})
div = div_art.find('div', attrs={'class':'kR'})
a = div.find('a', href=True)
url = a['href']
title = a.string
description = ''
pubdate = div_date.string.rstrip().lstrip()
summary = div.find('span', attrs={'class':'lead'})
desc = summary.find('a', href=True)
if desc:
desc.extract()
description = self.tag_to_string(summary, use_alt=False)
description = description.rstrip().lstrip()
feed = key if key is not None else 'Duzy Format'
if not articles.has_key(feed):
articles[feed] = []
if description != '': # skip just pictures atricle
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = [(key, articles[key])]
return ans
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'id':'Str'})
if pager:
#seek for 'a' element with nast value (if not found exit)
list = pager.findAll('a')
for elem in list:
if 'nast' in elem.string:
nexturl = elem['href']
soup2 = self.index_to_soup('http://warszawa.gazeta.pl' + nexturl)
texttag = soup2.find('div', attrs={'id':'artykul'})
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
# finally remove some tags
pager = soup.find('div',attrs={'id':'Str'})
if pager:
pager.extract()
pager = soup.find('div',attrs={'class':'tylko_int'})
if pager:
pager.extract()
return soup
@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
class WysokieObcasyRecipe(BasicNewsRecipe):
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
language = 'pl'
version = 1
title = u'Wysokie Obcasy'
publisher = 'Agora SA'
description = u'Serwis sobotniego dodatku do Gazety Wyborczej'
category='magazine'
language = 'pl'
publication_type = 'magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100000
recursions = 0
no_stylesheets = True
remove_javascript = True
simultaneous_downloads = 5
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))
remove_tags =[]
remove_tags.append(dict(name = 'img'))
remove_tags.append(dict(name = 'p', attrs = {'class' : 'info'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
'''
feeds = [
('Wszystkie Artykuly', 'feed://www.wysokieobcasy.pl/pub/rss/wysokieobcasy.xml'),
]
def print_version(self,url):
baseURL='http://www.wysokieobcasy.pl/wysokie-obcasy'
segments = url.split(',')
subPath= '/2029020,'
articleURL1 = segments[1]
articleURL2 = segments[2]
printVerString=articleURL1 + ',' + articleURL2
s= baseURL + subPath + printVerString + '.html'
return s
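A worked trace of print_version above, using a made-up article URL (the address and numeric segments are purely illustrative):
        # url      = 'http://www.wysokieobcasy.pl/wysokie-obcasy/1,53662,13698883,Tytul.html'   (hypothetical)
        # segments = ['http://www.wysokieobcasy.pl/wysokie-obcasy/1', '53662', '13698883', 'Tytul.html']
        # returns    'http://www.wysokieobcasy.pl/wysokie-obcasy/2029020,53662,13698883.html'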
def get_cover_url(self):
soup = self.index_to_soup('http://www.wysokieobcasy.pl/wysokie-obcasy/0,0.html')
self.cover_url = soup.find(attrs={'class':'holder_cr'}).find('img')['src']
return getattr(self, 'cover_url', self.cover_url)
@ -390,7 +390,6 @@
<xsl:output method = "xml"/> <xsl:output method = "xml"/>
<xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/> <xsl:key name="style-types" match="rtf:paragraph-definition" use="@style-number"/>
@ -415,13 +414,11 @@
</xsl:template> </xsl:template>
<xsl:template match="rtf:page-break"> <xsl:template match="rtf:page-break">
<xsl:element name="br"> <br style = "page-break-after:always"/>
<xsl:attribute name="style">page-break-after:always</xsl:attribute>
</xsl:element>
</xsl:template> </xsl:template>
<xsl:template match="rtf:hardline-break"> <xsl:template match="rtf:hardline-break">
<xsl:element name="br"/> <br/>
</xsl:template> </xsl:template>
<xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/> <xsl:template match="rtf:rtf-definition|rtf:font-table|rtf:color-table|rtf:style-table|rtf:page-definition|rtf:list-table|rtf:override-table|rtf:override-list|rtf:list-text"/>
@ -451,8 +448,15 @@
<xsl:template match = "rtf:field[@type='hyperlink']"> <xsl:template match = "rtf:field[@type='hyperlink']">
<xsl:element name ="a"> <xsl:element name ="a">
<xsl:attribute name = "href"> <xsl:attribute name = "href">
<xsl:choose>
<xsl:when test="@argument">
<xsl:value-of select="@argument"/>
</xsl:when>
<xsl:otherwise>
<xsl:if test = "not(contains(@link, '/'))">#</xsl:if> <xsl:if test = "not(contains(@link, '/'))">#</xsl:if>
<xsl:value-of select = "@link"/> <xsl:value-of select = "@link"/>
</xsl:otherwise>
</xsl:choose>
</xsl:attribute> </xsl:attribute>
<xsl:apply-templates/> <xsl:apply-templates/>
</xsl:element> </xsl:element>
@ -472,9 +476,7 @@
</xsl:template> </xsl:template>
<xsl:template match="rtf:pict"> <xsl:template match="rtf:pict">
<xsl:element name="img"> <img src = "{@num}"/>
<xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
</xsl:element>
</xsl:template> </xsl:template>
<xsl:template match="*"> <xsl:template match="*">
@ -1,6 +1,3 @@
" Project wide builtins
let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"
" Include directories for C++ modules " Include directories for C++ modules
let g:syntastic_cpp_include_dirs = [ let g:syntastic_cpp_include_dirs = [
\'/usr/include/python2.7', \'/usr/include/python2.7',
setup.cfg Normal file
@ -0,0 +1,4 @@
[flake8]
max-line-length = 160
builtins = _,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext
ignore = E12,E22,E231,E301,E302,E304,E401,W391
@ -24,38 +24,10 @@ class Message:
def __str__(self): def __str__(self):
return '%s:%s: %s' % (self.filename, self.lineno, self.msg) return '%s:%s: %s' % (self.filename, self.lineno, self.msg)
def check_for_python_errors(code_string, filename):
import _ast
# First, compile into an AST and handle syntax errors.
try:
tree = compile(code_string, filename, "exec", _ast.PyCF_ONLY_AST)
except (SyntaxError, IndentationError) as value:
msg = value.args[0]
(lineno, offset, text) = value.lineno, value.offset, value.text
# If there's an encoding problem with the file, the text is None.
if text is None:
# Avoid using msg, since for the only known case, it contains a
# bogus message that claims the encoding the file declared was
# unknown.
msg = "%s: problem decoding source" % filename
return [Message(filename, lineno, msg)]
else:
checker = __import__('pyflakes.checker').checker
# Okay, it's syntactically valid. Now check it.
w = checker.Checker(tree, filename)
w.messages.sort(lambda a, b: cmp(a.lineno, b.lineno))
return [Message(x.filename, x.lineno, x.message%x.message_args) for x in
w.messages]
class Check(Command): class Check(Command):
description = 'Check for errors in the calibre source code' description = 'Check for errors in the calibre source code'
BUILTINS = ['_', '__', 'dynamic_property', 'I', 'P', 'lopen', 'icu_lower',
'icu_upper', 'icu_title', 'ngettext']
CACHE = '.check-cache.pickle' CACHE = '.check-cache.pickle'
def get_files(self, cache): def get_files(self, cache):
@ -65,8 +37,8 @@ class Check(Command):
mtime = os.stat(y).st_mtime mtime = os.stat(y).st_mtime
if cache.get(y, 0) == mtime: if cache.get(y, 0) == mtime:
continue continue
if (f.endswith('.py') and f not in ('feedparser.py', if (f.endswith('.py') and f not in (
'pyparsing.py', 'markdown.py') and 'feedparser.py', 'markdown.py') and
'prs500/driver.py' not in y): 'prs500/driver.py' not in y):
yield y, mtime yield y, mtime
if f.endswith('.coffee'): if f.endswith('.coffee'):
@ -79,21 +51,18 @@ class Check(Command):
if f.endswith('.recipe') and cache.get(f, 0) != mtime: if f.endswith('.recipe') and cache.get(f, 0) != mtime:
yield f, mtime yield f, mtime
def run(self, opts): def run(self, opts):
cache = {} cache = {}
if os.path.exists(self.CACHE): if os.path.exists(self.CACHE):
cache = cPickle.load(open(self.CACHE, 'rb')) cache = cPickle.load(open(self.CACHE, 'rb'))
builtins = list(set_builtins(self.BUILTINS))
for f, mtime in self.get_files(cache): for f, mtime in self.get_files(cache):
self.info('\tChecking', f) self.info('\tChecking', f)
errors = False errors = False
ext = os.path.splitext(f)[1] ext = os.path.splitext(f)[1]
if ext in {'.py', '.recipe'}: if ext in {'.py', '.recipe'}:
w = check_for_python_errors(open(f, 'rb').read(), f) p = subprocess.Popen(['flake8', '--ignore=E,W', f])
if w: if p.wait() != 0:
errors = True errors = True
self.report_errors(w)
else: else:
from calibre.utils.serve_coffee import check_coffeescript from calibre.utils.serve_coffee import check_coffeescript
try: try:
@ -106,8 +75,6 @@ class Check(Command):
self.j(self.SRC, '../session.vim'), '-f', f]) self.j(self.SRC, '../session.vim'), '-f', f])
raise SystemExit(1) raise SystemExit(1)
cache[f] = mtime cache[f] = mtime
for x in builtins:
delattr(__builtin__, x)
cPickle.dump(cache, open(self.CACHE, 'wb'), -1) cPickle.dump(cache, open(self.CACHE, 'wb'), -1)
wn_path = os.path.expanduser('~/work/servers/src/calibre_servers/main') wn_path = os.path.expanduser('~/work/servers/src/calibre_servers/main')
if os.path.exists(wn_path): if os.path.exists(wn_path):
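A minimal sketch of the new per-file check, mirroring the subprocess call above; the file path is only an example, and it assumes flake8 is installed and reads the [flake8] section added in setup.cfg (which supplies builtins and the line-length limit, while --ignore=E,W restricts the run to pyflakes-style F checks):
import subprocess

p = subprocess.Popen(['flake8', '--ignore=E,W', 'src/calibre/ebooks/metadata/book/base.py'])
errors = p.wait() != 0   # non-zero exit status means flake8 reported problems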
File diff suppressed because it is too large
@ -10253,7 +10253,7 @@ msgstr ""
#. name for inh #. name for inh
msgid "Ingush" msgid "Ingush"
msgstr "Engelsk" msgstr "Ingush"
#. name for inj #. name for inj
msgid "Inga; Jungle" msgid "Inga; Jungle"
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
__appname__ = u'calibre' __appname__ = u'calibre'
numeric_version = (0, 9, 25) numeric_version = (0, 9, 27)
__version__ = u'.'.join(map(unicode, numeric_version)) __version__ = u'.'.join(map(unicode, numeric_version))
__author__ = u"Kovid Goyal <kovid@kovidgoyal.net>" __author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
@ -757,9 +757,10 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon from calibre.ebooks.metadata.sources.ozon import Ozon
# from calibre.ebooks.metadata.sources.google_images import GoogleImages from calibre.ebooks.metadata.sources.google_images import GoogleImages
from calibre.ebooks.metadata.sources.big_book_search import BigBookSearch
plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon] plugins += [GoogleBooks, GoogleImages, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon, BigBookSearch]
# }}} # }}}
@ -1467,6 +1468,17 @@ class StoreKoboStore(StoreBase):
formats = ['EPUB'] formats = ['EPUB']
affiliate = True affiliate = True
class StoreKoobeStore(StoreBase):
name = 'Koobe'
author = u'Tomasz Długosz'
description = u'Księgarnia internetowa oferuje ebooki (książki elektroniczne) w postaci plików epub, mobi i pdf.'
actual_plugin = 'calibre.gui2.store.stores.koobe_plugin:KoobeStore'
drm_free_only = True
headquarters = 'PL'
formats = ['EPUB', 'MOBI', 'PDF']
affiliate = True
class StoreLegimiStore(StoreBase): class StoreLegimiStore(StoreBase):
name = 'Legimi' name = 'Legimi'
author = u'Tomasz Długosz' author = u'Tomasz Długosz'
@ -1649,6 +1661,7 @@ class StoreWoblinkStore(StoreBase):
headquarters = 'PL' headquarters = 'PL'
formats = ['EPUB', 'MOBI', 'PDF', 'WOBLINK'] formats = ['EPUB', 'MOBI', 'PDF', 'WOBLINK']
affiliate = True
class XinXiiStore(StoreBase): class XinXiiStore(StoreBase):
name = 'XinXii' name = 'XinXii'
@ -1686,6 +1699,7 @@ plugins += [
StoreGoogleBooksStore, StoreGoogleBooksStore,
StoreGutenbergStore, StoreGutenbergStore,
StoreKoboStore, StoreKoboStore,
StoreKoobeStore,
StoreLegimiStore, StoreLegimiStore,
StoreLibreDEStore, StoreLibreDEStore,
StoreLitResStore, StoreLitResStore,
@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep config['enabled_plugins'] = ep
default_disabled_plugins = set([ default_disabled_plugins = set([
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', 'Big Book Search',
]) ])
def is_disabled(plugin): def is_disabled(plugin):
@ -68,4 +68,5 @@ Various things that require other things before they can be migrated:
libraries/switching/on calibre startup. libraries/switching/on calibre startup.
3. From refresh in the legacy interface: Rember to flush the composite 3. From refresh in the legacy interface: Rember to flush the composite
column template cache. column template cache.
4. Replace the metadatabackup thread with the new implementation when using the new backend.
''' '''
@ -41,7 +41,6 @@ Differences in semantics from pysqlite:
''' '''
class DynamicFilter(object): # {{{ class DynamicFilter(object): # {{{
'No longer used, present for legacy compatibility' 'No longer used, present for legacy compatibility'
@ -114,9 +113,10 @@ class DBPrefs(dict): # {{{
return default return default
def set_namespaced(self, namespace, key, val): def set_namespaced(self, namespace, key, val):
if u':' in key: raise KeyError('Colons are not allowed in keys') if u':' in key:
if u':' in namespace: raise KeyError('Colons are not allowed in' raise KeyError('Colons are not allowed in keys')
' the namespace') if u':' in namespace:
raise KeyError('Colons are not allowed in the namespace')
key = u'namespaced:%s:%s'%(namespace, key) key = u'namespaced:%s:%s'%(namespace, key)
self[key] = val self[key] = val
@ -170,7 +170,8 @@ def pynocase(one, two, encoding='utf-8'):
return cmp(one.lower(), two.lower()) return cmp(one.lower(), two.lower())
def _author_to_author_sort(x): def _author_to_author_sort(x):
if not x: return '' if not x:
return ''
return author_to_author_sort(x.replace('|', ',')) return author_to_author_sort(x.replace('|', ','))
def icu_collator(s1, s2): def icu_collator(s1, s2):
@ -305,7 +306,8 @@ class DB(object):
# Initialize database {{{ # Initialize database {{{
def __init__(self, library_path, default_prefs=None, read_only=False): def __init__(self, library_path, default_prefs=None, read_only=False,
restore_all_prefs=False, progress_callback=lambda x, y:True):
try: try:
if isbytestring(library_path): if isbytestring(library_path):
library_path = library_path.decode(filesystem_encoding) library_path = library_path.decode(filesystem_encoding)
@ -376,23 +378,27 @@ class DB(object):
UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL; UPDATE authors SET sort=author_to_author_sort(name) WHERE sort IS NULL;
''') ''')
self.initialize_prefs(default_prefs) self.initialize_prefs(default_prefs, restore_all_prefs, progress_callback)
self.initialize_custom_columns() self.initialize_custom_columns()
self.initialize_tables() self.initialize_tables()
def initialize_prefs(self, default_prefs): # {{{ def initialize_prefs(self, default_prefs, restore_all_prefs, progress_callback): # {{{
self.prefs = DBPrefs(self) self.prefs = DBPrefs(self)
if default_prefs is not None and not self._exists: if default_prefs is not None and not self._exists:
progress_callback(None, len(default_prefs))
# Only apply default prefs to a new database # Only apply default prefs to a new database
for key in default_prefs: for i, key in enumerate(default_prefs):
# be sure that prefs not to be copied are listed below # be sure that prefs not to be copied are listed below
if key not in frozenset(['news_to_be_synced']): if restore_all_prefs or key not in frozenset(['news_to_be_synced']):
self.prefs[key] = default_prefs[key] self.prefs[key] = default_prefs[key]
progress_callback(_('restored preference ') + key, i+1)
if 'field_metadata' in default_prefs: if 'field_metadata' in default_prefs:
fmvals = [f for f in default_prefs['field_metadata'].values() fmvals = [f for f in default_prefs['field_metadata'].values()
if f['is_custom']] if f['is_custom']]
for f in fmvals: progress_callback(None, len(fmvals))
for i, f in enumerate(fmvals):
progress_callback(_('creating custom column ') + f['label'], i)
self.create_custom_column(f['label'], f['name'], self.create_custom_column(f['label'], f['name'],
f['datatype'], f['datatype'],
(f['is_multiple'] is not None and (f['is_multiple'] is not None and
@ -421,6 +427,8 @@ class DB(object):
('uuid', False), ('comments', True), ('id', False), ('pubdate', False), ('uuid', False), ('comments', True), ('id', False), ('pubdate', False),
('last_modified', False), ('size', False), ('languages', False), ('last_modified', False), ('size', False), ('languages', False),
] ]
defs['virtual_libraries'] = {}
defs['virtual_lib_on_startup'] = defs['cs_virtual_lib_on_startup'] = ''
# Migrate the bool tristate tweak # Migrate the bool tristate tweak
defs['bools_are_tristate'] = \ defs['bools_are_tristate'] = \
@ -469,6 +477,24 @@ class DB(object):
except: except:
pass pass
# migrate the gui_restriction preference to a virtual library
gr_pref = self.prefs.get('gui_restriction', None)
if gr_pref:
virt_libs = self.prefs.get('virtual_libraries', {})
virt_libs[gr_pref] = 'search:"' + gr_pref + '"'
self.prefs['virtual_libraries'] = virt_libs
self.prefs['gui_restriction'] = ''
self.prefs['virtual_lib_on_startup'] = gr_pref
# migrate the cs_restriction preference to a virtual library
gr_pref = self.prefs.get('cs_restriction', None)
if gr_pref:
virt_libs = self.prefs.get('virtual_libraries', {})
virt_libs[gr_pref] = 'search:"' + gr_pref + '"'
self.prefs['virtual_libraries'] = virt_libs
self.prefs['cs_restriction'] = ''
self.prefs['cs_virtual_lib_on_startup'] = gr_pref
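A short trace of the preference migration above, with an illustrative saved-search name:
        # before: prefs['gui_restriction'] == 'SF only'                        (hypothetical value)
        # after:  prefs['virtual_libraries'] == {'SF only': 'search:"SF only"'}
        #         prefs['virtual_lib_on_startup'] == 'SF only'
        #         prefs['gui_restriction'] == ''
        # the cs_restriction preference is migrated the same way, into cs_virtual_lib_on_startup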
# Rename any user categories with names that differ only in case # Rename any user categories with names that differ only in case
user_cats = self.prefs.get('user_categories', []) user_cats = self.prefs.get('user_categories', [])
catmap = {} catmap = {}
@ -690,11 +716,13 @@ class DB(object):
tables['size'] = SizeTable('size', self.field_metadata['size'].copy()) tables['size'] = SizeTable('size', self.field_metadata['size'].copy())
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3, self.FIELD_MAP = {
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8, 'id':0, 'title':1, 'authors':2, 'timestamp':3, 'size':4,
'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12, 'rating':5, 'tags':6, 'comments':7, 'series':8, 'publisher':9,
'formats':13, 'path':14, 'pubdate':15, 'uuid':16, 'cover':17, 'series_index':10, 'sort':11, 'author_sort':12, 'formats':13,
'au_map':18, 'last_modified':19, 'identifiers':20} 'path':14, 'pubdate':15, 'uuid':16, 'cover':17, 'au_map':18,
'last_modified':19, 'identifiers':20, 'languages':21,
}
for k,v in self.FIELD_MAP.iteritems(): for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False) self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
@ -740,6 +768,8 @@ class DB(object):
self.field_metadata.set_field_record_index('ondevice', base, prefer_custom=False) self.field_metadata.set_field_record_index('ondevice', base, prefer_custom=False)
self.FIELD_MAP['marked'] = base = base+1 self.FIELD_MAP['marked'] = base = base+1
self.field_metadata.set_field_record_index('marked', base, prefer_custom=False) self.field_metadata.set_field_record_index('marked', base, prefer_custom=False)
self.FIELD_MAP['series_sort'] = base = base+1
self.field_metadata.set_field_record_index('series_sort', base, prefer_custom=False)
# }}} # }}}
@ -753,6 +783,11 @@ class DB(object):
self._conn = Connection(self.dbpath) self._conn = Connection(self.dbpath)
return self._conn return self._conn
def close(self):
if self._conn is not None:
self._conn.close()
del self._conn
@dynamic_property @dynamic_property
def user_version(self): def user_version(self):
doc = 'The user version of this database' doc = 'The user version of this database'
@ -1067,5 +1102,15 @@ class DB(object):
break # Fail silently since nothing catastrophic has happened break # Fail silently since nothing catastrophic has happened
curpath = os.path.join(curpath, newseg) curpath = os.path.join(curpath, newseg)
def write_backup(self, path, raw):
path = os.path.abspath(os.path.join(self.library_path, path, 'metadata.opf'))
with lopen(path, 'wb') as f:
f.write(raw)
def read_backup(self, path):
path = os.path.abspath(os.path.join(self.library_path, path, 'metadata.opf'))
with lopen(path, 'rb') as f:
return f.read()
# }}} # }}}
src/calibre/db/backup.py Normal file
@ -0,0 +1,115 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import weakref, traceback
from threading import Thread, Event
from calibre import prints
from calibre.ebooks.metadata.opf2 import metadata_to_opf
class Abort(Exception):
pass
class MetadataBackup(Thread):
'''
Continuously backup changed metadata into OPF files
in the book directory. This class runs in its own
thread.
'''
def __init__(self, db, interval=2, scheduling_interval=0.1):
Thread.__init__(self)
self.daemon = True
self._db = weakref.ref(db)
self.stop_running = Event()
self.interval = interval
self.scheduling_interval = scheduling_interval
@property
def db(self):
ans = self._db()
if ans is None:
raise Abort()
return ans
def stop(self):
self.stop_running.set()
def wait(self, interval):
if self.stop_running.wait(interval):
raise Abort()
def run(self):
while not self.stop_running.is_set():
try:
self.wait(self.interval)
self.do_one()
except Abort:
break
def do_one(self):
try:
book_id = self.db.get_a_dirtied_book()
if book_id is None:
return
except Abort:
raise
except:
# Happens during interpreter shutdown
return
self.wait(0)
try:
mi, sequence = self.db.get_metadata_for_dump(book_id)
except:
prints('Failed to get backup metadata for id:', book_id, 'once')
traceback.print_exc()
self.wait(self.interval)
try:
mi, sequence = self.db.get_metadata_for_dump(book_id)
except:
prints('Failed to get backup metadata for id:', book_id, 'again, giving up')
traceback.print_exc()
return
if mi is None:
self.db.clear_dirtied(book_id, sequence)
# Give the GUI thread a chance to do something. Python threads don't
# have priorities, so this thread would naturally keep the processor
# until some scheduling event happens. The wait makes such an event
self.wait(self.scheduling_interval)
try:
raw = metadata_to_opf(mi)
except:
prints('Failed to convert to opf for id:', book_id)
traceback.print_exc()
return
self.wait(self.scheduling_interval)
try:
self.db.write_backup(book_id, raw)
except:
prints('Failed to write backup metadata for id:', book_id, 'once')
self.wait(self.interval)
try:
self.db.write_backup(book_id, raw)
except:
prints('Failed to write backup metadata for id:', book_id, 'again, giving up')
return
self.db.clear_dirtied(book_id, sequence)
def break_cycles(self):
# Legacy compatibility
pass
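A minimal usage sketch for the new backup thread; it assumes `db` is a new-API database object exposing get_a_dirtied_book(), get_metadata_for_dump(), write_backup() and clear_dirtied(), as added elsewhere in this commit:
from calibre.db.backup import MetadataBackup

backup = MetadataBackup(db, interval=2)  # poll for dirtied books every 2 seconds
backup.start()                           # daemon thread, so it will not block interpreter exit
# ... normal library use; edits mark books dirty and are written out as metadata.opf ...
backup.stop()                            # sets the Event; the thread exits on its next wait()
backup.join()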
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, traceback import os, traceback, random
from io import BytesIO from io import BytesIO
from collections import defaultdict from collections import defaultdict
from functools import wraps, partial from functools import wraps, partial
@ -15,7 +15,7 @@ from functools import wraps, partial
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.db import SPOOL_SIZE from calibre.db import SPOOL_SIZE
from calibre.db.categories import get_categories from calibre.db.categories import get_categories
from calibre.db.locking import create_locks, RecordLock from calibre.db.locking import create_locks
from calibre.db.errors import NoSuchFormat from calibre.db.errors import NoSuchFormat
from calibre.db.fields import create_field from calibre.db.fields import create_field
from calibre.db.search import Search from calibre.db.search import Search
@ -23,9 +23,10 @@ from calibre.db.tables import VirtualTable
from calibre.db.write import get_series_values from calibre.db.write import get_series_values
from calibre.db.lazy import FormatMetadata, FormatsList from calibre.db.lazy import FormatMetadata, FormatsList
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.ptempfile import (base_dir, PersistentTemporaryFile, from calibre.ptempfile import (base_dir, PersistentTemporaryFile,
SpooledTemporaryFile) SpooledTemporaryFile)
from calibre.utils.date import now from calibre.utils.date import now as nowf
from calibre.utils.icu import sort_key from calibre.utils.icu import sort_key
def api(f): def api(f):
@ -57,9 +58,10 @@ class Cache(object):
self.fields = {} self.fields = {}
self.composites = set() self.composites = set()
self.read_lock, self.write_lock = create_locks() self.read_lock, self.write_lock = create_locks()
self.record_lock = RecordLock(self.read_lock)
self.format_metadata_cache = defaultdict(dict) self.format_metadata_cache = defaultdict(dict)
self.formatter_template_cache = {} self.formatter_template_cache = {}
self.dirtied_cache = {}
self.dirtied_sequence = 0
self._search_api = Search(self.field_metadata.get_search_terms()) self._search_api = Search(self.field_metadata.get_search_terms())
# Implement locking for all simple read/write API methods # Implement locking for all simple read/write API methods
@ -78,17 +80,18 @@ class Cache(object):
self.initialize_dynamic() self.initialize_dynamic()
@write_api
def initialize_dynamic(self): def initialize_dynamic(self):
# Reconstruct the user categories, putting them into field_metadata # Reconstruct the user categories, putting them into field_metadata
# Assumption is that someone else will fix them if they change. # Assumption is that someone else will fix them if they change.
self.field_metadata.remove_dynamic_categories() self.field_metadata.remove_dynamic_categories()
for user_cat in sorted(self.pref('user_categories', {}).iterkeys(), key=sort_key): for user_cat in sorted(self._pref('user_categories', {}).iterkeys(), key=sort_key):
cat_name = '@' + user_cat # add the '@' to avoid name collision cat_name = '@' + user_cat # add the '@' to avoid name collision
self.field_metadata.add_user_category(label=cat_name, name=user_cat) self.field_metadata.add_user_category(label=cat_name, name=user_cat)
# add grouped search term user categories # add grouped search term user categories
muc = frozenset(self.pref('grouped_search_make_user_categories', [])) muc = frozenset(self._pref('grouped_search_make_user_categories', []))
for cat in sorted(self.pref('grouped_search_terms', {}).iterkeys(), key=sort_key): for cat in sorted(self._pref('grouped_search_terms', {}).iterkeys(), key=sort_key):
if cat in muc: if cat in muc:
# There is a chance that these can be duplicates of an existing # There is a chance that these can be duplicates of an existing
# user category. Print the exception and continue. # user category. Print the exception and continue.
@ -102,10 +105,15 @@ class Cache(object):
# self.field_metadata.add_search_category(label='search', name=_('Searches')) # self.field_metadata.add_search_category(label='search', name=_('Searches'))
self.field_metadata.add_grouped_search_terms( self.field_metadata.add_grouped_search_terms(
self.pref('grouped_search_terms', {})) self._pref('grouped_search_terms', {}))
self._search_api.change_locations(self.field_metadata.get_search_terms()) self._search_api.change_locations(self.field_metadata.get_search_terms())
self.dirtied_cache = {x:i for i, (x,) in enumerate(
self.backend.conn.execute('SELECT book FROM metadata_dirtied'))}
if self.dirtied_cache:
self.dirtied_sequence = max(self.dirtied_cache.itervalues())+1
@property @property
def field_metadata(self): def field_metadata(self):
return self.backend.field_metadata return self.backend.field_metadata
@ -131,7 +139,7 @@ class Cache(object):
mi.author_link_map = aul mi.author_link_map = aul
mi.comments = self._field_for('comments', book_id) mi.comments = self._field_for('comments', book_id)
mi.publisher = self._field_for('publisher', book_id) mi.publisher = self._field_for('publisher', book_id)
n = now() n = nowf()
mi.timestamp = self._field_for('timestamp', book_id, default_value=n) mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
mi.pubdate = self._field_for('pubdate', book_id, default_value=n) mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
mi.uuid = self._field_for('uuid', book_id, mi.uuid = self._field_for('uuid', book_id,
@ -395,16 +403,19 @@ class Cache(object):
''' '''
if as_file: if as_file:
ret = SpooledTemporaryFile(SPOOL_SIZE) ret = SpooledTemporaryFile(SPOOL_SIZE)
if not self.copy_cover_to(book_id, ret): return if not self.copy_cover_to(book_id, ret):
return
ret.seek(0) ret.seek(0)
elif as_path: elif as_path:
pt = PersistentTemporaryFile('_dbcover.jpg') pt = PersistentTemporaryFile('_dbcover.jpg')
with pt: with pt:
if not self.copy_cover_to(book_id, pt): return if not self.copy_cover_to(book_id, pt):
return
ret = pt.name ret = pt.name
else: else:
buf = BytesIO() buf = BytesIO()
if not self.copy_cover_to(book_id, buf): return if not self.copy_cover_to(book_id, buf):
return
ret = buf.getvalue() ret = buf.getvalue()
if as_image: if as_image:
from PyQt4.Qt import QImage from PyQt4.Qt import QImage
@ -413,7 +424,7 @@ class Cache(object):
ret = i ret = i
return ret return ret
@api @read_api
def copy_cover_to(self, book_id, dest, use_hardlink=False): def copy_cover_to(self, book_id, dest, use_hardlink=False):
''' '''
Copy the cover to the file like object ``dest``. Returns False Copy the cover to the file like object ``dest``. Returns False
@ -422,17 +433,15 @@ class Cache(object):
copied to it iff the path is different from the current path (taking copied to it iff the path is different from the current path (taking
case sensitivity into account). case sensitivity into account).
''' '''
with self.read_lock:
try: try:
path = self._field_for('path', book_id).replace('/', os.sep) path = self._field_for('path', book_id).replace('/', os.sep)
except: except AttributeError:
return False return False
with self.record_lock.lock(book_id):
return self.backend.copy_cover_to(path, dest, return self.backend.copy_cover_to(path, dest,
use_hardlink=use_hardlink) use_hardlink=use_hardlink)
@api @read_api
def copy_format_to(self, book_id, fmt, dest, use_hardlink=False): def copy_format_to(self, book_id, fmt, dest, use_hardlink=False):
''' '''
Copy the format ``fmt`` to the file like object ``dest``. If the Copy the format ``fmt`` to the file like object ``dest``. If the
@ -441,14 +450,12 @@ class Cache(object):
the path is different from the current path (taking case sensitivity the path is different from the current path (taking case sensitivity
into account). into account).
''' '''
with self.read_lock:
try: try:
name = self.fields['formats'].format_fname(book_id, fmt) name = self.fields['formats'].format_fname(book_id, fmt)
path = self._field_for('path', book_id).replace('/', os.sep) path = self._field_for('path', book_id).replace('/', os.sep)
except: except (KeyError, AttributeError):
raise NoSuchFormat('Record %d has no %s file'%(book_id, fmt)) raise NoSuchFormat('Record %d has no %s file'%(book_id, fmt))
with self.record_lock.lock(book_id):
return self.backend.copy_format_to(book_id, fmt, name, path, dest, return self.backend.copy_format_to(book_id, fmt, name, path, dest,
use_hardlink=use_hardlink) use_hardlink=use_hardlink)
@ -520,16 +527,16 @@ class Cache(object):
this means that repeated calls yield the same this means that repeated calls yield the same
temp file (which is re-created each time) temp file (which is re-created each time)
''' '''
with self.read_lock:
ext = ('.'+fmt.lower()) if fmt else '' ext = ('.'+fmt.lower()) if fmt else ''
if as_path:
if preserve_filename:
with self.read_lock:
try: try:
fname = self.fields['formats'].format_fname(book_id, fmt) fname = self.fields['formats'].format_fname(book_id, fmt)
except: except:
return None return None
fname += ext fname += ext
if as_path:
if preserve_filename:
bd = base_dir() bd = base_dir()
d = os.path.join(bd, 'format_abspath') d = os.path.join(bd, 'format_abspath')
try: try:
@ -537,21 +544,26 @@ class Cache(object):
except: except:
pass pass
ret = os.path.join(d, fname) ret = os.path.join(d, fname)
with self.record_lock.lock(book_id):
try: try:
self.copy_format_to(book_id, fmt, ret) self.copy_format_to(book_id, fmt, ret)
except NoSuchFormat: except NoSuchFormat:
return None return None
else: else:
with PersistentTemporaryFile(ext) as pt, self.record_lock.lock(book_id): with PersistentTemporaryFile(ext) as pt:
try: try:
self.copy_format_to(book_id, fmt, pt) self.copy_format_to(book_id, fmt, pt)
except NoSuchFormat: except NoSuchFormat:
return None return None
ret = pt.name ret = pt.name
elif as_file: elif as_file:
with self.read_lock:
try:
fname = self.fields['formats'].format_fname(book_id, fmt)
except:
return None
fname += ext
ret = SpooledTemporaryFile(SPOOL_SIZE) ret = SpooledTemporaryFile(SPOOL_SIZE)
with self.record_lock.lock(book_id):
try: try:
self.copy_format_to(book_id, fmt, ret) self.copy_format_to(book_id, fmt, ret)
except NoSuchFormat: except NoSuchFormat:
@ -562,7 +574,6 @@ class Cache(object):
ret.name = fname ret.name = fname
else: else:
buf = BytesIO() buf = BytesIO()
with self.record_lock.lock(book_id):
try: try:
self.copy_format_to(book_id, fmt, buf) self.copy_format_to(book_id, fmt, buf)
except NoSuchFormat: except NoSuchFormat:
@ -620,6 +631,30 @@ class Cache(object):
return get_categories(self, sort=sort, book_ids=book_ids, return get_categories(self, sort=sort, book_ids=book_ids,
icon_map=icon_map) icon_map=icon_map)
@write_api
def update_last_modified(self, book_ids, now=None):
if now is None:
now = nowf()
if book_ids:
f = self.fields['last_modified']
f.writer.set_books({book_id:now for book_id in book_ids}, self.backend)
@write_api
def mark_as_dirty(self, book_ids):
self._update_last_modified(book_ids)
already_dirtied = set(self.dirtied_cache).intersection(book_ids)
new_dirtied = book_ids - already_dirtied
already_dirtied = {book_id:self.dirtied_sequence+i for i, book_id in enumerate(already_dirtied)}
if already_dirtied:
self.dirtied_sequence = max(already_dirtied.itervalues()) + 1
self.dirtied_cache.update(already_dirtied)
if new_dirtied:
self.backend.conn.executemany('INSERT OR IGNORE INTO metadata_dirtied (book) VALUES (?)',
((x,) for x in new_dirtied))
new_dirtied = {book_id:self.dirtied_sequence+i for i, book_id in enumerate(new_dirtied)}
self.dirtied_sequence = max(new_dirtied.itervalues()) + 1
self.dirtied_cache.update(new_dirtied)
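The sequence numbers above exist so that the backup thread and later edits do not race; a worked trace with illustrative values:
        # 1. set_field() dirties book 7          -> dirtied_cache == {7: 0}
        # 2. backup thread reads the metadata    -> get_metadata_for_dump(7) returns (mi, 0)
        # 3. the user edits book 7 again         -> mark_as_dirty() bumps it: dirtied_cache == {7: 1}
        # 4. backup thread finishes its write    -> clear_dirtied(7, 0) sees 1 != 0 and keeps the
        #    entry, so the newer change is backed up on a later pass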
@write_api @write_api
def set_field(self, name, book_id_to_val_map, allow_case_change=True): def set_field(self, name, book_id_to_val_map, allow_case_change=True):
f = self.fields[name] f = self.fields[name]
@ -657,7 +692,7 @@ class Cache(object):
if dirtied and update_path: if dirtied and update_path:
self._update_path(dirtied, mark_as_dirtied=False) self._update_path(dirtied, mark_as_dirtied=False)
# TODO: Mark these as dirtied so that the opf is regenerated self._mark_as_dirty(dirtied)
return dirtied return dirtied
@ -668,9 +703,111 @@ class Cache(object):
author = self._field_for('authors', book_id, default_value=(_('Unknown'),))[0] author = self._field_for('authors', book_id, default_value=(_('Unknown'),))[0]
self.backend.update_path(book_id, title, author, self.fields['path'], self.fields['formats']) self.backend.update_path(book_id, title, author, self.fields['path'], self.fields['formats'])
if mark_as_dirtied: if mark_as_dirtied:
self._mark_as_dirty(book_ids)
@read_api
def get_a_dirtied_book(self):
if self.dirtied_cache:
return random.choice(tuple(self.dirtied_cache.iterkeys()))
return None
@read_api
def get_metadata_for_dump(self, book_id):
mi = None
# get the current sequence number for this book to pass back to the
# backup thread. This will avoid double calls in the case where the
# thread has not done the work between the put and the get_metadata
sequence = self.dirtied_cache.get(book_id, None)
if sequence is not None:
try:
# While a book is being created, the path is empty. Don't bother to
# try to write the opf, because it will go to the wrong folder.
if self._field_for('path', book_id):
mi = self._get_metadata(book_id)
# Always set cover to cover.jpg. Even if cover doesn't exist,
# no harm done. This way no need to call dirtied when
# cover is set/removed
mi.cover = 'cover.jpg'
except:
# This almost certainly means that the book has been deleted while
# the backup operation sat in the queue.
pass pass
# TODO: Mark these books as dirtied so that metadata.opf is return mi, sequence
# re-created
@write_api
def clear_dirtied(self, book_id, sequence):
'''
Clear the dirtied indicator for the books. This is used when fetching
metadata, creating an OPF, and writing a file are separated into steps.
The last step is clearing the indicator
'''
dc_sequence = self.dirtied_cache.get(book_id, None)
if dc_sequence is None or sequence is None or dc_sequence == sequence:
self.backend.conn.execute('DELETE FROM metadata_dirtied WHERE book=?',
(book_id,))
self.dirtied_cache.pop(book_id, None)
@write_api
def write_backup(self, book_id, raw):
try:
path = self._field_for('path', book_id).replace('/', os.sep)
except:
return
self.backend.write_backup(path, raw)
@read_api
def dirty_queue_length(self):
return len(self.dirtied_cache)
@read_api
def read_backup(self, book_id):
''' Return the OPF metadata backup for the book as a bytestring or None
if no such backup exists. '''
try:
path = self._field_for('path', book_id).replace('/', os.sep)
except:
return
try:
return self.backend.read_backup(path)
except EnvironmentError:
return None
@write_api
def dump_metadata(self, book_ids=None, remove_from_dirtied=True,
callback=None):
'''
Write metadata for each record to an individual OPF file. If callback
is not None, it is called once at the start with the number of book_ids
being processed. And once for every book_id, with arguments (book_id,
mi, ok).
'''
if book_ids is None:
book_ids = set(self.dirtied_cache)
if callback is not None:
callback(len(book_ids), True, False)
for book_id in book_ids:
if self._field_for('path', book_id) is None:
if callback is not None:
callback(book_id, None, False)
continue
mi, sequence = self._get_metadata_for_dump(book_id)
if mi is None:
if callback is not None:
callback(book_id, mi, False)
continue
try:
raw = metadata_to_opf(mi)
self._write_backup(book_id, raw)
if remove_from_dirtied:
self._clear_dirtied(book_id, sequence)
except:
pass
if callback is not None:
callback(book_id, mi, True)
# }}} # }}}
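A sketch of the callback protocol described in the docstring above (`cache` is assumed to be a Cache instance):
    def report(x, mi, ok):
        if mi is True:    # first call: x is the total number of records to dump
            print('dumping %d records' % x)
        else:             # later calls: x is a book_id, mi is the Metadata (or None), ok says whether the OPF was written
            print('book %s: %s' % (x, 'written' if ok else 'skipped'))

    cache.dump_metadata(callback=report)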
src/calibre/db/legacy.py Normal file
@ -0,0 +1,94 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from functools import partial
from calibre.db.backend import DB
from calibre.db.cache import Cache
from calibre.db.view import View
class LibraryDatabase(object):
''' Emulate the old LibraryDatabase2 interface '''
PATH_LIMIT = DB.PATH_LIMIT
WINDOWS_LIBRARY_PATH_LIMIT = DB.WINDOWS_LIBRARY_PATH_LIMIT
@classmethod
def exists_at(cls, path):
return path and os.path.exists(os.path.join(path, 'metadata.db'))
def __init__(self, library_path,
default_prefs=None, read_only=False, is_second_db=False,
progress_callback=lambda x, y:True, restore_all_prefs=False):
self.is_second_db = is_second_db # TODO: Use is_second_db
backend = self.backend = DB(library_path, default_prefs=default_prefs,
read_only=read_only, restore_all_prefs=restore_all_prefs,
progress_callback=progress_callback)
cache = self.new_api = Cache(backend)
cache.init()
self.data = View(cache)
self.get_property = self.data.get_property
for prop in (
'author_sort', 'authors', 'comment', 'comments',
'publisher', 'rating', 'series', 'series_index', 'tags',
'title', 'timestamp', 'uuid', 'pubdate', 'ondevice',
'metadata_last_modified', 'languages',
):
fm = {'comment':'comments', 'metadata_last_modified':
'last_modified', 'title_sort':'sort'}.get(prop, prop)
setattr(self, prop, partial(self.get_property,
loc=self.FIELD_MAP[fm]))
def close(self):
self.backend.close()
def break_cycles(self):
self.data.cache.backend = None
self.data.cache = None
self.data = self.backend = self.new_api = self.field_metadata = self.prefs = self.listeners = self.refresh_ondevice = None
# Library wide properties {{{
@property
def field_metadata(self):
return self.backend.field_metadata
@property
def user_version(self):
return self.backend.user_version
@property
def library_id(self):
return self.backend.library_id
def last_modified(self):
return self.backend.last_modified()
@property
def custom_column_num_map(self):
return self.backend.custom_column_num_map
@property
def custom_column_label_map(self):
return self.backend.custom_column_label_map
@property
def FIELD_MAP(self):
return self.backend.FIELD_MAP
def all_ids(self):
for book_id in self.data.cache.all_book_ids():
yield book_id
# }}}
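The legacy shim above exposes the old attribute-style accessors (db.title(), db.authors(), ...) by pre-binding a generic get_property(idx, loc=...) with functools.partial, one callable per FIELD_MAP column. An illustrative sketch of that pattern with hypothetical names (not calibre code):

    from functools import partial

    class FakeView(object):
        FIELD_MAP = {'title': 0, 'authors': 1}
        _data = {1: ('A Book', 'Some Author')}

        def get_property(self, book_id, loc=-1, index_is_id=True):
            return self._data[book_id][loc]

    class LegacyShim(object):
        pass

    view = FakeView()
    db = LegacyShim()
    for prop, loc in view.FIELD_MAP.items():
        # each column becomes a method-like attribute bound to its FIELD_MAP slot
        setattr(db, prop, partial(view.get_property, loc=loc))

    print(db.title(1))    # -> 'A Book'
    print(db.authors(1))  # -> 'Some Author'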

View File

@ -191,7 +191,7 @@ class SHLock(object): # {{{
try:
return self._free_waiters.pop()
except IndexError:
return Condition(self._lock)
def _return_waiter(self, waiter):
self._free_waiters.append(waiter)

View File

@ -172,7 +172,6 @@ class SchemaUpgrade(object):
'''
)
def upgrade_version_6(self):
'Show authors in order'
self.conn.execute('''

View File

@ -64,7 +64,7 @@ def _match(query, value, matchkind, use_primary_find_in_search=True):
else:
internal_match_ok = False
for t in value:
try: # ignore regexp exceptions, required because search-ahead tries before typing is finished
t = icu_lower(t)
if (matchkind == EQUALS_MATCH):
if internal_match_ok:
@ -195,13 +195,13 @@ class DateSearch(object): # {{{
try:
qd = now() - timedelta(int(num))
except:
raise ParseException(_('Number conversion error: {0}').format(num))
field_count = 3
else:
try:
qd = parse_date(query, as_utc=False)
except:
raise ParseException(_('Date conversion error: {0}').format(query))
if '-' in query:
field_count = query.count('-') + 1
else:
@ -285,8 +285,8 @@ class NumericSearch(object): # {{{
try:
q = cast(query) * mult
except:
raise ParseException(
_('Non-numeric value in query: {0}').format(query))
for val, book_ids in field_iter():
if val is None:
@ -351,8 +351,8 @@ class KeyPairSearch(object): # {{{
if ':' in query:
q = [q.strip() for q in query.split(':')]
if len(q) != 2:
raise ParseException(
_('Invalid query format for colon-separated search: {0}').format(query))
keyq, valq = q
keyq_mkind, keyq = _matchkind(keyq)
valq_mkind, valq = _matchkind(valq)
@ -465,7 +465,8 @@ class Parser(SearchQueryParser):
if invert:
matches = self.all_book_ids - matches
return matches
raise ParseException(
_('Recursive query group detected: {0}').format(query))
# If the user has asked to restrict searching over all field, apply
# that restriction
@ -547,7 +548,8 @@ class Parser(SearchQueryParser):
field_metadata = {}
for x, fm in self.field_metadata.iteritems():
if x.startswith('@'):
continue
if fm['search_terms'] and x != 'series_sort':
all_locs.add(x)
field_metadata[x] = fm

View File

@ -16,6 +16,9 @@ rmtree = partial(shutil.rmtree, ignore_errors=True)
class BaseTest(unittest.TestCase):
longMessage = True
maxDiff = None
def setUp(self):
self.library_path = self.mkdtemp()
self.create_db(self.library_path)
@ -40,10 +43,10 @@ class BaseTest(unittest.TestCase):
db.conn.close()
return dest
def init_cache(self, library_path=None):
from calibre.db.backend import DB
from calibre.db.cache import Cache
backend = DB(library_path or self.library_path)
cache = Cache(backend)
cache.init()
return cache
@ -53,9 +56,13 @@ class BaseTest(unittest.TestCase):
atexit.register(rmtree, ans)
return ans
def init_old(self, library_path=None):
from calibre.library.database2 import LibraryDatabase2
return LibraryDatabase2(library_path or self.library_path)
def init_legacy(self, library_path=None):
from calibre.db.legacy import LibraryDatabase
return LibraryDatabase(library_path or self.library_path)
def clone_library(self, library_path):
if not hasattr(self, 'clone_dir'):
@ -81,7 +88,8 @@ class BaseTest(unittest.TestCase):
'ondevice_col', 'last_modified', 'has_cover',
'cover_data'}.union(allfk1)
for attr in all_keys:
if attr == 'user_metadata':
continue
attr1, attr2 = getattr(mi1, attr), getattr(mi2, attr)
if attr == 'formats':
attr1, attr2 = map(lambda x:tuple(x) if x else (), (attr1, attr2))

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.db.tests.base import BaseTest
class LegacyTest(BaseTest):
''' Test the emulation of the legacy interface. '''
def test_library_wide_properties(self): # {{{
'Test library wide properties'
def get_props(db):
props = ('user_version', 'is_second_db', 'library_id', 'field_metadata',
'custom_column_label_map', 'custom_column_num_map')
fprops = ('last_modified', )
ans = {x:getattr(db, x) for x in props}
ans.update({x:getattr(db, x)() for x in fprops})
ans['all_ids'] = frozenset(db.all_ids())
return ans
old = self.init_old()
oldvals = get_props(old)
old.close()
del old
db = self.init_legacy()
newvals = get_props(db)
self.assertEqual(oldvals, newvals)
db.close()
# }}}
def test_get_property(self): # {{{
'Test the get_property interface for reading data'
def get_values(db):
ans = {}
for label, loc in db.FIELD_MAP.iteritems():
if isinstance(label, int):
label = '#'+db.custom_column_num_map[label]['label']
label = type('')(label)
ans[label] = tuple(db.get_property(i, index_is_id=True, loc=loc)
for i in db.all_ids())
if label in ('id', 'title', '#tags'):
with self.assertRaises(IndexError):
db.get_property(9999, loc=loc)
with self.assertRaises(IndexError):
db.get_property(9999, index_is_id=True, loc=loc)
if label in {'tags', 'formats'}:
# Order is random in the old db for these
ans[label] = tuple(set(x.split(',')) if x else x for x in ans[label])
return ans
old = self.init_old()
old_vals = get_values(old)
old.close()
old = None
db = self.init_legacy()
new_vals = get_values(db)
db.close()
self.assertEqual(old_vals, new_vals)
# }}}

View File

@ -9,15 +9,32 @@ __docformat__ = 'restructuredtext en'
import unittest, os, argparse
try:
import init_calibre # noqa
except ImportError:
pass
def find_tests():
return unittest.defaultTestLoader.discover(os.path.dirname(os.path.abspath(__file__)), pattern='*.py')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('name', nargs='?', default=None,
help='The name of the test to run, for e.g. writing.WritingTest.many_many_basic or .many_many_basic for a shortcut')
args = parser.parse_args()
if args.name and args.name.startswith('.'):
tests = find_tests()
ans = None
try:
for suite in tests:
for test in suite._tests:
for s in test:
if s._testMethodName == args.name[1:]:
tests = s
raise StopIteration()
except StopIteration:
pass
else:
tests = unittest.defaultTestLoader.loadTestsFromName(args.name) if args.name else find_tests()
unittest.TextTestRunner(verbosity=4).run(tests)

View File

@ -8,6 +8,7 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import datetime
from io import BytesIO
from calibre.utils.date import utc_tz
from calibre.db.tests.base import BaseTest
@ -205,6 +206,9 @@ class ReadingTest(BaseTest):
else:
self.assertEqual(cdata, cache.cover(book_id, as_path=True),
'Reading of null cover as path failed')
buf = BytesIO()
self.assertFalse(cache.copy_cover_to(99999, buf), 'copy_cover_to() did not return False for non-existent book_id')
self.assertFalse(cache.copy_cover_to(3, buf), 'copy_cover_to() did not return False for non-existent cover')
# }}}
@ -305,6 +309,7 @@ class ReadingTest(BaseTest):
def test_get_formats(self): # {{{
'Test reading ebook formats using the format() method'
from calibre.library.database2 import LibraryDatabase2
from calibre.db.cache import NoSuchFormat
old = LibraryDatabase2(self.library_path)
ids = old.all_ids()
lf = {i:set(old.formats(i, index_is_id=True).split(',')) if old.formats(
@ -332,6 +337,9 @@ class ReadingTest(BaseTest):
self.assertEqual(old, f.read(),
'Failed to read format as path')
buf = BytesIO()
self.assertRaises(NoSuchFormat, cache.copy_format_to, 99999, 'X', buf, 'copy_format_to() failed to raise an exception for non-existent book')
self.assertRaises(NoSuchFormat, cache.copy_format_to, 1, 'X', buf, 'copy_format_to() failed to raise an exception for non-existent format')
# }}}

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
from collections import namedtuple
from functools import partial
from io import BytesIO
from calibre.ebooks.metadata import author_to_author_sort
from calibre.utils.date import UNDEFINED_DATE
@ -16,6 +17,7 @@ from calibre.db.tests.base import BaseTest
class WritingTest(BaseTest): class WritingTest(BaseTest):
# Utils {{{
def create_getter(self, name, getter=None): def create_getter(self, name, getter=None):
if getter is None: if getter is None:
if name.endswith('_index'): if name.endswith('_index'):
@ -70,6 +72,7 @@ class WritingTest(BaseTest):
'Failed setting for %s, sqlite value not the same: %r != %r'%( 'Failed setting for %s, sqlite value not the same: %r != %r'%(
test.name, old_sqlite_res, sqlite_res)) test.name, old_sqlite_res, sqlite_res))
del db del db
# }}}
def test_one_one(self): # {{{ def test_one_one(self): # {{{
'Test setting of values in one-one fields' 'Test setting of values in one-one fields'
@ -289,6 +292,67 @@ class WritingTest(BaseTest):
ae(c.field_for('sort', 1), 'Moose, The') ae(c.field_for('sort', 1), 'Moose, The')
ae(c.field_for('sort', 2), 'Cat') ae(c.field_for('sort', 2), 'Cat')
# }}} # }}}
def test_dirtied(self): # {{{
'Test the setting of the dirtied flag and the last_modified column'
cl = self.cloned_library
cache = self.init_cache(cl)
ae, af, sf = self.assertEqual, self.assertFalse, cache.set_field
# First empty dirtied
cache.dump_metadata()
af(cache.dirtied_cache)
af(self.init_cache(cl).dirtied_cache)
prev = cache.field_for('last_modified', 3)
import calibre.db.cache as c
from datetime import timedelta
utime = prev+timedelta(days=1)
onowf = c.nowf
c.nowf = lambda: utime
try:
ae(sf('title', {3:'xxx'}), set([3]))
self.assertTrue(3 in cache.dirtied_cache)
ae(cache.field_for('last_modified', 3), utime)
cache.dump_metadata()
raw = cache.read_backup(3)
from calibre.ebooks.metadata.opf2 import OPF
opf = OPF(BytesIO(raw))
ae(opf.title, 'xxx')
finally:
c.nowf = onowf
# }}}
def test_backup(self): # {{{
'Test the automatic backup of changed metadata'
cl = self.cloned_library
cache = self.init_cache(cl)
ae, af, sf, ff = self.assertEqual, self.assertFalse, cache.set_field, cache.field_for
# First empty dirtied
cache.dump_metadata()
af(cache.dirtied_cache)
from calibre.db.backup import MetadataBackup
interval = 0.01
mb = MetadataBackup(cache, interval=interval, scheduling_interval=0)
mb.start()
try:
ae(sf('title', {1:'title1', 2:'title2', 3:'title3'}), {1,2,3})
ae(sf('authors', {1:'author1 & author2', 2:'author1 & author2', 3:'author1 & author2'}), {1,2,3})
count = 6
while cache.dirty_queue_length() and count > 0:
mb.join(interval)
count -= 1
af(cache.dirty_queue_length())
finally:
mb.stop()
mb.join(interval)
af(mb.is_alive())
from calibre.ebooks.metadata.opf2 import OPF
for book_id in (1, 2, 3):
raw = cache.read_backup(book_id)
opf = OPF(BytesIO(raw))
ae(opf.title, 'title%d'%book_id)
ae(opf.authors, ['author1', 'author2'])
# }}}

View File

@ -11,6 +11,9 @@ import weakref
from functools import partial
from itertools import izip, imap
from calibre.ebooks.metadata import title_sort
from calibre.utils.config_base import tweaks
def sanitize_sort_field_name(field_metadata, field):
field = field_metadata.search_term_to_field_key(field.lower().strip())
# translate some fields to their hidden equivalent
@ -40,6 +43,18 @@ class TableRow(list):
else:
return view._field_getters[obj](self.book_id)
def format_is_multiple(x, sep=',', repl=None):
if not x:
return None
if repl is not None:
x = (y.replace(sep, repl) for y in x)
return sep.join(x)
def format_identifiers(x):
if not x:
return None
return ','.join('%s:%s'%(k, v) for k, v in x.iteritems())
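The two helpers above convert the new cache's tuple/dict values back into the flat strings the legacy view exposed. A quick illustration (functions copied from the diff, adapted to .items() so the snippet runs stand-alone; the sample data is made up):

    from functools import partial

    def format_is_multiple(x, sep=',', repl=None):
        if not x:
            return None
        if repl is not None:
            x = (y.replace(sep, repl) for y in x)
        return sep.join(x)

    def format_identifiers(x):
        if not x:
            return None
        return ','.join('%s:%s' % (k, v) for k, v in x.items())

    print(format_is_multiple(('Fantasy', 'Fiction')))                # 'Fantasy,Fiction'
    print(partial(format_is_multiple, repl='|')(('Last, First',)))   # 'Last| First' (author-style escaping)
    print(format_identifiers({'isbn': '9780000000000'}))             # 'isbn:9780000000000'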
class View(object):
''' A table view of the database, with rows and columns. Also supports
@ -49,33 +64,63 @@ class View(object):
self.cache = cache
self.marked_ids = {}
self.search_restriction_book_count = 0
self.search_restriction = self.base_restriction = ''
self.search_restriction_name = self.base_restriction_name = ''
self._field_getters = {}
for col, idx in cache.backend.FIELD_MAP.iteritems():
label, fmt = col, lambda x:x
func = {
'id': self._get_id,
'au_map': self.get_author_data,
'ondevice': self.get_ondevice,
'marked': self.get_marked,
'series_sort':self.get_series_sort,
}.get(col, self._get)
if isinstance(col, int):
label = self.cache.backend.custom_column_num_map[col]['label']
label = (self.cache.backend.field_metadata.custom_field_prefix
+ label)
if label.endswith('_index'):
try:
num = int(label.partition('_')[0])
except ValueError:
pass # series_index
else:
label = self.cache.backend.custom_column_num_map[num]['label']
label = (self.cache.backend.field_metadata.custom_field_prefix
+ label + '_index')
fm = self.field_metadata[label]
if label == 'authors':
fmt = partial(format_is_multiple, repl='|')
elif label in {'tags', 'languages', 'formats'}:
fmt = format_is_multiple
elif label == 'cover':
fmt = bool
elif label == 'identifiers':
fmt = format_identifiers
elif fm['datatype'] == 'text' and fm['is_multiple']:
sep = fm['is_multiple']['cache_to_list']
if sep not in {'&','|'}:
sep = '|'
fmt = partial(format_is_multiple, sep=sep)
self._field_getters[idx] = partial(func, label, fmt=fmt) if func == self._get else func
self._map = tuple(self.cache.all_book_ids())
self._map_filtered = tuple(self._map)
def get_property(self, id_or_index, index_is_id=False, loc=-1):
book_id = id_or_index if index_is_id else self._map_filtered[id_or_index]
return self._field_getters[loc](book_id)
@property
def field_metadata(self):
return self.cache.field_metadata
def _get_id(self, idx, index_is_id=True):
if index_is_id and idx not in self.cache.all_book_ids():
raise IndexError('No book with id %s present'%idx)
return idx if index_is_id else self.index_to_id(idx)
def __getitem__(self, row):
@ -107,9 +152,21 @@ class View(object):
def index_to_id(self, idx):
return self._map_filtered[idx]
def _get(self, field, idx, index_is_id=True, default_value=None, fmt=lambda x:x):
id_ = idx if index_is_id else self.index_to_id(idx)
if index_is_id and id_ not in self.cache.all_book_ids():
raise IndexError('No book with id %s present'%idx)
return fmt(self.cache.field_for(field, id_, default_value=default_value))
def get_series_sort(self, idx, index_is_id=True, default_value=''):
book_id = idx if index_is_id else self.index_to_id(idx)
with self.cache.read_lock:
lang_map = self.cache.fields['languages'].book_value_map
lang = lang_map.get(book_id, None) or None
if lang:
lang = lang[0]
return title_sort(self.cache._field_for('series', book_id, default_value=''),
order=tweaks['title_series_sorting'], lang=lang)
def get_ondevice(self, idx, index_is_id=True, default_value=''):
id_ = idx if index_is_id else self.index_to_id(idx)
@ -119,26 +176,15 @@ class View(object):
id_ = idx if index_is_id else self.index_to_id(idx)
return self.marked_ids.get(id_, default_value)
def get_author_data(self, idx, index_is_id=True, default_value=None):
'''
Return author data for all authors of the book identified by idx as a
tuple of dictionaries. The dictionaries should never be empty, unless
there is a bug somewhere. The list could be empty if idx point to an
non existent book, or book with no authors (though again a book with no
authors should never happen).
Each dictionary has the keys: name, sort, link. Link can be an empty
string.
default_value is ignored, this method always returns a tuple
'''
id_ = idx if index_is_id else self.index_to_id(idx)
with self.cache.read_lock:
ids = self.cache._field_ids_for('authors', id_)
ans = []
for id_ in ids:
data = self.cache._author_data(id_)
ans.append(':::'.join((data['name'], data['sort'], data['link'])))
return ':#:'.join(ans) if ans else default_value
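The new get_author_data() packs each author as 'name:::sort:::link' and joins authors with ':#:' to match the legacy au_map column. A small round-trip sketch (the delimiters come from the diff; the pack/unpack helpers and sample data are illustrative, not calibre code):

    def pack_authors(authors):
        return ':#:'.join(':::'.join((a['name'], a['sort'], a['link'])) for a in authors) or None

    def unpack_authors(au_map):
        if not au_map:
            return []
        return [dict(zip(('name', 'sort', 'link'), entry.split(':::')))
                for entry in au_map.split(':#:')]

    packed = pack_authors([
        {'name': 'John Doe', 'sort': 'Doe, John', 'link': ''},
        {'name': 'Jane Roe', 'sort': 'Roe, Jane', 'link': 'http://example.com'},
    ])
    print(packed)  # 'John Doe:::Doe, John:::' joined to 'Jane Roe:::Roe, Jane:::http://example.com' with ':#:'
    assert unpack_authors(packed)[1]['sort'] == 'Roe, Jane'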
def multisort(self, fields=[], subsort=False, only_ids=None):
fields = [(sanitize_sort_field_name(self.field_metadata, x), bool(y)) for x, y in fields]
@ -168,8 +214,19 @@ class View(object):
return ans
self._map_filtered = tuple(ans)
def _build_restriction_string(self, restriction):
if self.base_restriction:
if restriction:
return u'(%s) and (%s)' % (self.base_restriction, restriction)
else:
return self.base_restriction
else:
return restriction
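_build_restriction_string() above simply ANDs the virtual-library (base) restriction with the current search restriction, falling back to whichever one is non-empty. A stand-alone version of the same logic with made-up queries:

    def build_restriction_string(base_restriction, restriction):
        if base_restriction:
            if restriction:
                return u'(%s) and (%s)' % (base_restriction, restriction)
            return base_restriction
        return restriction

    print(build_restriction_string('tags:"=SF"', 'rating:>3'))  # (tags:"=SF") and (rating:>3)
    print(build_restriction_string('', 'rating:>3'))            # rating:>3
    print(build_restriction_string('tags:"=SF"', ''))           # tags:"=SF"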
def search_getting_ids(self, query, search_restriction,
set_restriction_count=False, use_virtual_library=True):
if use_virtual_library:
search_restriction = self._build_restriction_string(search_restriction)
q = ''
if not query or not query.strip():
q = search_restriction
@ -188,11 +245,32 @@ class View(object):
self.search_restriction_book_count = len(rv)
return rv
def get_search_restriction(self):
return self.search_restriction
def set_search_restriction(self, s):
self.search_restriction = s
def get_base_restriction(self):
return self.base_restriction
def set_base_restriction(self, s):
self.base_restriction = s
def get_base_restriction_name(self):
return self.base_restriction_name
def set_base_restriction_name(self, s):
self.base_restriction_name = s
def get_search_restriction_name(self):
return self.search_restriction_name
def set_search_restriction_name(self, s):
self.search_restriction_name = s
def search_restriction_applied(self):
return bool(self.search_restriction) or bool(self.base_restriction)
def get_search_restriction_book_count(self):
return self.search_restriction_book_count

View File

@ -71,6 +71,7 @@ class ANDROID(USBMS):
0x42f7 : [0x216],
0x4365 : [0x216],
0x4366 : [0x216],
0x4371 : [0x216],
},
# Freescale
0x15a2 : {
@ -239,7 +240,7 @@ class ANDROID(USBMS):
'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID',
'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS',
'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894']
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -250,7 +251,7 @@ class ANDROID(USBMS):
'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875',
'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894']
OSX_MAIN_MEM = 'Android Device Main Memory'

View File

@ -35,11 +35,11 @@ class KOBO(USBMS):
gui_name = 'Kobo Reader'
description = _('Communicate with the Kobo Reader')
author = 'Timothy Legge and David Forrester'
version = (2, 0, 8)
dbversion = 0
fwversion = 0
supported_dbversion = 80
has_kepubs = False
supported_platforms = ['windows', 'osx', 'linux']
@ -419,7 +419,7 @@ class KOBO(USBMS):
# If all this succeeds we need to delete the images files via the ImageID
return ImageID
def delete_images(self, ImageID, book_path):
if ImageID != None:
path_prefix = '.kobo/images/'
path = self._main_prefix + path_prefix + ImageID
@ -449,7 +449,7 @@ class KOBO(USBMS):
ImageID = self.delete_via_sql(ContentID, ContentType)
#print " We would now delete the Images for" + ImageID
self.delete_images(ImageID, path)
if os.path.exists(path):
# Delete the ebook
@ -1204,10 +1204,16 @@ class KOBOTOUCH(KOBO):
description = 'Communicate with the Kobo Touch, Glo and Mini firmware. Based on the existing Kobo driver by %s.' % (KOBO.author)
# icon = I('devices/kobotouch.jpg')
supported_dbversion = 80
min_supported_dbversion = 53
min_dbversion_series = 65
min_dbversion_archive = 71
min_dbversion_images_on_sdcard = 77
max_supported_fwversion = (2,5,1)
min_fwversion_images_on_sdcard = (2,4,1)
has_kepubs = True
booklist_class = KTCollectionsBookList
book_class = Book
@ -1354,14 +1360,13 @@ class KOBOTOUCH(KOBO):
# Determine the firmware version
try:
with open(self.normalize_path(self._main_prefix + '.kobo/version'), 'rb') as f:
self.fwversion = f.readline().split(',')[2]
self.fwversion = tuple((int(x) for x in self.fwversion.split('.')))
except:
self.fwversion = (0,0,0)
debug_print('Version of driver:', self.version, 'Has kepubs:', self.has_kepubs)
debug_print('Version of firmware:', self.fwversion, 'Has kepubs:', self.has_kepubs)
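The change above parses the Kobo 'version' file into a tuple of ints so firmware checks become ordinary tuple comparisons instead of string tests. A sketch with a made-up version line, assuming only the layout the driver relies on (firmware version in the third comma-separated field):

    def parse_fwversion(version_line):
        # third comma-separated field holds the firmware version, e.g. '2.4.0'
        try:
            raw = version_line.split(',')[2]
            return tuple(int(x) for x in raw.split('.'))
        except Exception:
            return (0, 0, 0)  # same fallback as the driver

    fw = parse_fwversion('SERIAL,3.0.35+,2.4.0,3.0.35+,3.0.35+,00000000-0000-0000-0000-000000000000')
    print(fw)                # (2, 4, 0)
    print(fw >= (1, 9, 17))  # True  -> treated as newer ("V2") firmware
    print(fw >= (2, 4, 1))   # False -> covers on the SD card not yet supported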
@ -1466,6 +1471,7 @@ class KOBOTOUCH(KOBO):
if show_debug: if show_debug:
self.debug_index = idx self.debug_index = idx
debug_print("KoboTouch:update_booklist - idx=%d"%idx) debug_print("KoboTouch:update_booklist - idx=%d"%idx)
debug_print("KoboTouch:update_booklist - lpath=%s"%lpath)
debug_print('KoboTouch:update_booklist - bl[idx].device_collections=', bl[idx].device_collections) debug_print('KoboTouch:update_booklist - bl[idx].device_collections=', bl[idx].device_collections)
debug_print('KoboTouch:update_booklist - playlist_map=', playlist_map) debug_print('KoboTouch:update_booklist - playlist_map=', playlist_map)
debug_print('KoboTouch:update_booklist - bookshelves=', bookshelves) debug_print('KoboTouch:update_booklist - bookshelves=', bookshelves)
@ -1477,7 +1483,7 @@ class KOBOTOUCH(KOBO):
bl_cache[lpath] = None
if ImageID is not None:
imagename = self.imagefilename_from_imageID(prefix, ImageID)
if imagename is not None:
bl[idx].thumbnail = ImageWrapper(imagename)
if (ContentType == '6' and MimeType != 'application/x-kobo-epub+zip'):
@ -1717,12 +1723,14 @@ class KOBOTOUCH(KOBO):
debug_print("KoboTouch:books - end - oncard='%s'"%oncard) debug_print("KoboTouch:books - end - oncard='%s'"%oncard)
return bl return bl
def imagefilename_from_imageID(self, ImageID): def imagefilename_from_imageID(self, prefix, ImageID):
show_debug = self.is_debugging_title(ImageID) show_debug = self.is_debugging_title(ImageID)
path = self.images_path(prefix)
path = self.normalize_path(path.replace('/', os.sep))
for ending, cover_options in self.cover_file_endings().items(): for ending, cover_options in self.cover_file_endings().items():
fpath = self._main_prefix + '.kobo/images/' + ImageID + ending fpath = path + ImageID + ending
fpath = self.normalize_path(fpath.replace('/', os.sep))
if os.path.exists(fpath): if os.path.exists(fpath):
if show_debug: if show_debug:
debug_print("KoboTouch:imagefilename_from_imageID - have cover image fpath=%s" % (fpath)) debug_print("KoboTouch:imagefilename_from_imageID - have cover image fpath=%s" % (fpath))
@ -1764,7 +1772,7 @@ class KOBOTOUCH(KOBO):
if not self.copying_covers():
imageID = self.imageid_from_contentid(contentID)
self.delete_images(imageID, fname)
connection.commit()
cursor.close()
@ -1821,11 +1829,11 @@ class KOBOTOUCH(KOBO):
return imageId
def delete_images(self, ImageID, book_path):
debug_print("KoboTouch:delete_images - ImageID=", ImageID)
if ImageID != None:
path = self.images_path(book_path)
path = path + ImageID
for ending in self.cover_file_endings().keys():
fpath = path + ending
@ -1872,12 +1880,14 @@ class KOBOTOUCH(KOBO):
def get_content_type_from_extension(self, extension):
debug_print("KoboTouch:get_content_type_from_extension - start")
# With new firmware, ContentType appears to be 6 for all types of sideloaded books.
if self.fwversion >= (1,9,17) or extension == '.kobo' or extension == '.mobi':
debug_print("KoboTouch:get_content_type_from_extension - V2 firmware")
ContentType = 6
# For older firmware, it depends on the type of file.
elif extension == '.kobo' or extension == '.mobi':
ContentType = 6
else:
ContentType = 901
return ContentType
def update_device_database_collections(self, booklists, collections_attributes, oncard): def update_device_database_collections(self, booklists, collections_attributes, oncard):
@ -2088,8 +2098,8 @@ class KOBOTOUCH(KOBO):
# debug_print('KoboTouch: not uploading cover')
return
# Only upload covers to SD card if that is supported
if self._card_a_prefix and path.startswith(self._card_a_prefix) and not self.supports_covers_on_sdcard():
return
if not opts.extra_customization[self.OPT_UPLOAD_GRAYSCALE_COVERS]:
@ -2111,6 +2121,16 @@ class KOBOTOUCH(KOBO):
ImageID = ImageID.replace('.', '_')
return ImageID
def images_path(self, path):
if self._card_a_prefix and path.startswith(self._card_a_prefix) and self.supports_covers_on_sdcard():
path_prefix = 'koboExtStorage/images/'
path = self._card_a_prefix + path_prefix
else:
path_prefix = '.kobo/images/'
path = self._main_prefix + path_prefix
return path
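images_path() above picks the cover directory from where the book lives: covers for books on the SD card go under koboExtStorage/images/ (when the firmware supports it), everything else under .kobo/images/ on main memory. A stand-alone sketch of that decision with made-up mount prefixes:

    def images_path(main_prefix, card_a_prefix, book_path, covers_on_sdcard_supported):
        if card_a_prefix and book_path.startswith(card_a_prefix) and covers_on_sdcard_supported:
            return card_a_prefix + 'koboExtStorage/images/'
        return main_prefix + '.kobo/images/'

    print(images_path('/media/KOBOeReader/', '/media/KOBOSD/',
                      '/media/KOBOSD/books/b.epub', True))
    # /media/KOBOSD/koboExtStorage/images/
    print(images_path('/media/KOBOeReader/', '/media/KOBOSD/',
                      '/media/KOBOeReader/books/b.epub', True))
    # /media/KOBOeReader/.kobo/images/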
def _upload_cover(self, path, filename, metadata, filepath, uploadgrayscale, keep_cover_aspect=False): def _upload_cover(self, path, filename, metadata, filepath, uploadgrayscale, keep_cover_aspect=False):
from calibre.utils.magick.draw import save_cover_data_to, identify_data from calibre.utils.magick.draw import save_cover_data_to, identify_data
debug_print("KoboTouch:_upload_cover - filename='%s' uploadgrayscale='%s' "%(filename, uploadgrayscale)) debug_print("KoboTouch:_upload_cover - filename='%s' uploadgrayscale='%s' "%(filename, uploadgrayscale))
@ -2151,8 +2171,8 @@ class KOBOTOUCH(KOBO):
cursor.close()
if ImageID != None:
path = self.images_path(path) + ImageID
if show_debug:
debug_print("KoboTouch:_upload_cover - About to loop over cover endings")
@ -2524,6 +2544,52 @@ class KOBOTOUCH(KOBO):
def supports_kobo_archive(self): def supports_kobo_archive(self):
return self.dbversion >= self.min_dbversion_archive return self.dbversion >= self.min_dbversion_archive
def supports_covers_on_sdcard(self):
return self.dbversion >= 77 and self.fwversion >= self.min_fwversion_images_on_sdcard
def modify_database_check(self, function):
# Checks to see whether the database version is supported
# and whether the user has chosen to support the firmware version
# debug_print("KoboTouch:modify_database_check - self.fwversion <= self.max_supported_fwversion=", self.fwversion > self.max_supported_fwversion)
if self.dbversion > self.supported_dbversion or self.fwversion > self.max_supported_fwversion:
# Unsupported database
opts = self.settings()
if not opts.extra_customization[self.OPT_SUPPORT_NEWER_FIRMWARE]:
debug_print('The database has been upgraded past supported version')
self.report_progress(1.0, _('Removing books from device...'))
from calibre.devices.errors import UserFeedback
raise UserFeedback(_("Kobo database version unsupported - See details"),
_('Your Kobo is running an updated firmware/database version.'
' As calibre does not know about this updated firmware,'
' database editing is disabled, to prevent corruption.'
' You can still send books to your Kobo with calibre, '
' but deleting books and managing collections is disabled.'
' If you are willing to experiment and know how to reset'
' your Kobo to Factory defaults, you can override this'
' check by right clicking the device icon in calibre and'
' selecting "Configure this device" and then the '
' "Attempt to support newer firmware" option.'
' Doing so may require you to perform a factory reset of'
' your Kobo.'
),
UserFeedback.WARN)
return False
else:
# The user chose to edit the database anyway
return True
else:
# Supported database version
return True
# @classmethod
# def get_gui_name(cls):
# if hasattr(cls, 'gui_name'):
# return cls.gui_name
# if hasattr(cls, '__name__'):
# return cls.__name__
# return cls.name
@classmethod @classmethod
def is_debugging_title(cls, title): def is_debugging_title(cls, title):

View File

@ -4,12 +4,15 @@ __copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from os.path import dirname, abspath, relpath as _relpath, exists, basename
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
def relpath(*args):
return _relpath(*args).replace(os.sep, '/')
class HTMLOutput(OutputFormatPlugin):
name = 'HTML Output'
View File

@ -97,6 +97,12 @@ class TXTInput(InputFormatPlugin):
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
# Remove BOM from start of txt as its presence can confuse markdown
import codecs
for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
if txt.startswith(bom):
txt = txt[len(bom):]
break
txt = txt.decode(ienc, 'replace')
# Replace entities
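The TXT input fix above strips any Unicode BOM from the raw bytes before decoding, since a BOM that survives into the text confuses the markdown processor. A stand-alone sketch of the same check (sample data made up, BOM order mirrors the diff):

    import codecs

    def strip_bom(raw):
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if raw.startswith(bom):
                return raw[len(bom):]
        return raw

    raw = codecs.BOM_UTF8 + b'# Heading\n\nbody text\n'
    print(strip_bom(raw).decode('utf-8'))  # starts cleanly with '# Heading'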

View File

@ -68,7 +68,6 @@ class Resource(object): # {{{
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
self.fragment = url[-1] self.fragment = url[-1]
def href(self, basedir=None): def href(self, basedir=None):
''' '''
Return a URL pointing to this resource. If it is a file on the filesystem Return a URL pointing to this resource. If it is a file on the filesystem
@ -180,7 +179,6 @@ class ManifestItem(Resource): # {{{
self.mime_type = val self.mime_type = val
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
def __unicode__(self): def __unicode__(self):
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href(), self.media_type) return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href(), self.media_type)
@ -190,7 +188,6 @@ class ManifestItem(Resource): # {{{
def __repr__(self): def __repr__(self):
return unicode(self) return unicode(self)
def __getitem__(self, index): def __getitem__(self, index):
if index == 0: if index == 0:
return self.href() return self.href()
@ -245,7 +242,6 @@ class Manifest(ResourceCollection): # {{{
ResourceCollection.__init__(self) ResourceCollection.__init__(self)
self.next_id = 1 self.next_id = 1
def item(self, id): def item(self, id):
for i in self: for i in self:
if i.id == id: if i.id == id:
@ -309,13 +305,10 @@ class Spine(ResourceCollection): # {{{
continue continue
return s return s
def __init__(self, manifest): def __init__(self, manifest):
ResourceCollection.__init__(self) ResourceCollection.__init__(self)
self.manifest = manifest self.manifest = manifest
def replace(self, start, end, ids): def replace(self, start, end, ids):
''' '''
Replace the items between start (inclusive) and end (not inclusive) with Replace the items between start (inclusive) and end (not inclusive) with
@ -363,7 +356,6 @@ class Guide(ResourceCollection): # {{{
ans += 'title="%s" '%self.title ans += 'title="%s" '%self.title
return ans + '/>' return ans + '/>'
@staticmethod @staticmethod
def from_opf_guide(references, base_dir=os.getcwdu()): def from_opf_guide(references, base_dir=os.getcwdu()):
coll = Guide() coll = Guide()
@ -501,9 +493,10 @@ class OPF(object): # {{{
CONTENT = XPath('self::*[re:match(name(), "meta$", "i")]/@content')
TEXT = XPath('string()')
metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]')
metadata_elem_path = XPath(
'descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") '
'and re:match(@name, concat("^calibre:", $name, "$"), "i"))]')
title_path = XPath('descendant::*[re:match(name(), "title", "i")]')
authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]')
bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]')
@ -640,7 +633,8 @@ class OPF(object): # {{{
if 'toc' in item.href().lower():
toc = item.path
if toc is None:
return
self.toc = TOC(base_path=self.base_dir)
is_ncx = getattr(self, 'manifest', None) is not None and \
self.manifest.type_for_id(toc) is not None and \
@ -976,7 +970,6 @@ class OPF(object): # {{{
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@dynamic_property @dynamic_property
def language(self): def language(self):
@ -990,7 +983,6 @@ class OPF(object): # {{{
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@dynamic_property @dynamic_property
def languages(self): def languages(self):
@ -1015,7 +1007,6 @@ class OPF(object): # {{{
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)
@dynamic_property @dynamic_property
def book_producer(self): def book_producer(self):
@ -1196,7 +1187,6 @@ class OPFCreator(Metadata):
if self.cover: if self.cover:
self.guide.set_cover(self.cover) self.guide.set_cover(self.cover)
def create_manifest(self, entries): def create_manifest(self, entries):
''' '''
Create <manifest> Create <manifest>

View File

@ -132,7 +132,7 @@ class Worker(Thread): # Get details {{{
text()="Détails sur le produit" or \ text()="Détails sur le produit" or \
text()="Detalles del producto" or \ text()="Detalles del producto" or \
text()="Detalhes do produto" or \ text()="Detalhes do produto" or \
text()="登録情報"]/../div[@class="content"] starts-with(text(), "登録情報")]/../div[@class="content"]
''' '''
# Editor: is for Spanish # Editor: is for Spanish
self.publisher_xpath = ''' self.publisher_xpath = '''
@ -235,6 +235,12 @@ class Worker(Thread): # Get details {{{
msg = 'Failed to parse amazon details page: %r'%self.url msg = 'Failed to parse amazon details page: %r'%self.url
self.log.exception(msg) self.log.exception(msg)
return return
if self.domain == 'jp':
for a in root.xpath('//a[@href]'):
if 'black-curtain-redirect.html' in a.get('href'):
self.url = 'http://amazon.co.jp'+a.get('href')
self.log('Black curtain redirect found, following')
return self.get_details()
errmsg = root.xpath('//*[@id="errorMessage"]') errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg: if errmsg:
@ -252,8 +258,8 @@ class Worker(Thread): # Get details {{{
self.log.exception('Error parsing asin for url: %r'%self.url)
asin = None
if self.testing:
import tempfile, uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
suffix='.html', delete=False) as f:
f.write(raw)
print ('Downloaded html for', asin, 'saved in', f.name)
@ -270,7 +276,6 @@ class Worker(Thread): # Get details {{{
self.log.exception('Error parsing authors for url: %r'%self.url) self.log.exception('Error parsing authors for url: %r'%self.url)
authors = [] authors = []
if not title or not authors or not asin: if not title or not authors or not asin:
self.log.error('Could not find title/authors/asin for %r'%self.url) self.log.error('Could not find title/authors/asin for %r'%self.url)
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
@ -425,7 +430,6 @@ class Worker(Thread): # Get details {{{
desc = re.sub(r'(?s)<!--.*?-->', '', desc) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc) return sanitize_comments_html(desc)
def parse_comments(self, root): def parse_comments(self, root):
ans = '' ans = ''
desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
@ -499,7 +503,7 @@ class Worker(Thread): # Get details {{{
def parse_language(self, pd):
for x in reversed(pd.xpath(self.language_xpath)):
if x.tail:
raw = x.tail.strip().partition(',')[0].strip()
ans = self.lang_map.get(raw, None)
if ans:
return ans
@ -631,7 +635,6 @@ class Amazon(Source):
mi.tags = list(map(fixcase, mi.tags)) mi.tags = list(map(fixcase, mi.tags))
mi.isbn = check_isbn(mi.isbn) mi.isbn = check_isbn(mi.isbn)
def create_query(self, log, title=None, authors=None, identifiers={}, # {{{ def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
domain=None): domain=None):
if domain is None: if domain is None:
@ -718,7 +721,10 @@ class Amazon(Source):
def title_ok(title):
title = title.lower()
bad = ['bulk pack', '[audiobook]', '[audio cd]']
if self.domain == 'com':
bad.append('(spanish edition)')
for x in bad:
if x in title:
return False
return True
@ -745,7 +751,6 @@ class Amazon(Source):
matches.append(a.get('href')) matches.append(a.get('href'))
break break
# Keep only the top 5 matches as the matches are sorted by relevance by # Keep only the top 5 matches as the matches are sorted by relevance by
# Amazon so lower matches are not likely to be very relevant # Amazon so lower matches are not likely to be very relevant
return matches[:5] return matches[:5]
@ -789,7 +794,6 @@ class Amazon(Source):
log.exception(msg) log.exception(msg)
return as_unicode(msg) return as_unicode(msg)
raw = clean_ascii_chars(xml_to_unicode(raw, raw = clean_ascii_chars(xml_to_unicode(raw,
strip_encoding_pats=True, resolve_entities=True)[0]) strip_encoding_pats=True, resolve_entities=True)[0])
@ -819,7 +823,6 @@ class Amazon(Source):
# The error is almost always a not found error # The error is almost always a not found error
found = False found = False
if found: if found:
matches = self.parse_results_page(root) matches = self.parse_results_page(root)
@ -901,6 +904,11 @@ if __name__ == '__main__': # tests {{{
isbn_test, title_test, authors_test, comments_test, series_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # Has a spanish edition
{'title':'11/22/63'},
[title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),]
),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'title':'C++ Concurrency in Action'}, {'title':'C++ Concurrency in Action'},
[title_test('C++ Concurrency in Action: Practical Multithreading', [title_test('C++ Concurrency in Action: Practical Multithreading',
@ -911,8 +919,8 @@ if __name__ == '__main__': # tests {{{
( # Series
{'identifiers':{'amazon':'0756407117'}},
[title_test(
"Throne of the Crescent Moon",
exact=True), series_test('Crescent Moon Kingdoms', 1),
comments_test('Makhslood'),
]
),
@ -920,8 +928,8 @@ if __name__ == '__main__': # tests {{{
( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}},
[title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
exact=True),
comments_test('Jelena'), comments_test('Leslie'),
]
),
@ -1004,6 +1012,11 @@ if __name__ == '__main__': # tests {{{
] # }}} ] # }}}
jp_tests = [ # {{{ jp_tests = [ # {{{
( # Adult filtering test
{'identifiers':{'isbn':'4799500066'}},
[title_test(u' '),]
),
( # isbn -> title, authors ( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720'}}, {'identifiers':{'isbn': '9784101302720'}},
[title_test(u'精霊の守り人', [title_test(u'精霊の守り人',

View File

@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
# Google covers are often poor quality (scans/errors) but they have high
# resolution, so they trump covers from better sources. So make sure they
# are only used if no other covers are found.
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2, 'Big Book Search':2}
def create_log(ostream=None):
from calibre.utils.logging import ThreadSafeLog, FileStream
@ -429,6 +429,40 @@ class Source(Plugin):
mi.tags = list(map(fixcase, mi.tags))
mi.isbn = check_isbn(mi.isbn)
def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
if not urls:
log('No images found for, title: %r and authors: %r'%(title, authors))
return
from threading import Thread
import time
if prefs_name:
urls = urls[:self.prefs[prefs_name]]
if get_best_cover:
urls = urls[:1]
log('Downloading %d covers'%len(urls))
workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
for w in workers:
w.daemon = True
w.start()
alive = True
start_time = time.time()
while alive and not abort.is_set() and time.time() - start_time < timeout:
alive = False
for w in workers:
if w.is_alive():
alive = True
break
abort.wait(0.1)
def download_image(self, url, timeout, log, result_queue):
try:
ans = self.browser.open_novisit(url, timeout=timeout).read()
result_queue.put((self, ans))
log('Downloaded cover from: %s'%url)
except Exception:
self.log.exception('Failed to download cover from: %r'%url)
# }}}
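download_multiple_covers() above fans the URLs out to daemon threads and then polls them, bailing out when the abort Event is set or the overall timeout expires. The waiting pattern in isolation (dummy work function, not calibre code):

    import time
    from threading import Thread, Event

    def wait_for_workers(workers, abort, timeout):
        for w in workers:
            w.daemon = True
            w.start()
        start_time = time.time()
        alive = True
        while alive and not abort.is_set() and time.time() - start_time < timeout:
            alive = any(w.is_alive() for w in workers)
            abort.wait(0.1)  # sleep briefly, but wake immediately if aborted

    abort = Event()
    workers = [Thread(target=time.sleep, args=(0.3,)) for _ in range(3)]
    wait_for_workers(workers, abort, timeout=5)
    print('all workers done, aborted, or timed out')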
# Metadata API {{{

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.metadata.sources.base import Source, Option
def get_urls(br, tokens):
from urllib import quote_plus
from mechanize import Request
from lxml import html
escaped = [quote_plus(x.encode('utf-8')) for x in tokens if x and x.strip()]
q = b'+'.join(escaped)
url = 'http://bigbooksearch.com/books/'+q
br.open(url).read()
req = Request('http://bigbooksearch.com/query.php?SearchIndex=books&Keywords=%s&ItemPage=1'%q)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', url)
raw = br.open(req).read()
root = html.fromstring(raw.decode('utf-8'))
urls = [i.get('src') for i in root.xpath('//img[@src]')]
return urls
class BigBookSearch(Source):
name = 'Big Book Search'
description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
capabilities = frozenset(['cover'])
config_help_message = _('Configure the Big Book Search plugin')
can_get_multiple_covers = True
options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
_('The maximum number of covers to process from the search result')),
)
supports_gzip_transfer_encoding = True
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if not title:
return
br = self.browser
tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
urls = get_urls(br, tokens)
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
def test():
from calibre import browser
import pprint
br = browser()
urls = get_urls(br, ['consider', 'phlebas', 'banks'])
pprint.pprint(urls)
if __name__ == '__main__':
test()

View File

@ -18,12 +18,13 @@ from calibre.utils.magick.draw import Image, save_cover_data_to
class Worker(Thread):
def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq, get_best_cover=False):
Thread.__init__(self)
self.daemon = True
self.plugin = plugin
self.abort = abort
self.get_best_cover = get_best_cover
self.buf = BytesIO()
self.log = create_log(self.buf)
self.title, self.authors, self.identifiers = (title, authors,
@ -37,7 +38,7 @@ class Worker(Thread):
try:
if self.plugin.can_get_multiple_covers:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors, get_best_cover=self.get_best_cover,
identifiers=self.identifiers, timeout=self.timeout)
else:
self.plugin.download_cover(self.log, self.rq, self.abort,
@ -72,7 +73,7 @@ def process_result(log, result):
return (plugin, width, height, fmt, data) return (plugin, width, height, fmt, data)
def run_download(log, results, abort, def run_download(log, results, abort,
title=None, authors=None, identifiers={}, timeout=30): title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
''' '''
Run the cover download, putting results into the queue :param:`results`. Run the cover download, putting results into the queue :param:`results`.
@ -89,7 +90,7 @@ def run_download(log, results, abort,
plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()] plugins = [p for p in metadata_plugins(['cover']) if p.is_configured()]
rq = Queue() rq = Queue()
workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p workers = [Worker(p, abort, title, authors, identifiers, timeout, rq, get_best_cover=get_best_cover) for p
in plugins] in plugins]
for w in workers: for w in workers:
w.start() w.start()
@ -163,7 +164,7 @@ def download_cover(log,
abort = Event() abort = Event()
run_download(log, rq, abort, title=title, authors=authors, run_download(log, rq, abort, title=title, authors=authors,
identifiers=identifiers, timeout=timeout) identifiers=identifiers, timeout=timeout, get_best_cover=True)
results = [] results = []
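
Aside (not part of the commit): a minimal sketch of how the new get_best_cover flag is meant to travel. The single-cover chooser calls run_download() with get_best_cover=True, each Worker forwards it, and plugins with can_get_multiple_covers trim their hit list to one image; the bulk cover download keeps the default False and collects everything. The title/author values below are example inputs only, and the snippet assumes a working calibre environment with configured cover sources.

# Sketch only; run_download() blocks until the workers finish and fills
# `results` with processed (plugin, width, height, fmt, data) tuples.
from Queue import Queue            # Python 2, as used by this code base
from threading import Event
from calibre.utils.logging import default_log
from calibre.ebooks.metadata.sources.covers import run_download

results, abort = Queue(), Event()
run_download(default_log, results, abort, title='Consider Phlebas',
             authors=['Iain M. Banks'], get_best_cover=True)
while not results.empty():
    plugin, width, height, fmt, data = results.get()
    print('%s: %dx%d %s, %d bytes' % (plugin.name, width, height, fmt, len(data)))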


@@ -106,6 +106,8 @@ class Worker(Thread): # {{{
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
+                if ', Ship Date:' in pub:
+                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE:
                    mi.pubdate = q
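
Aside (not part of the commit): the feed's publication field can carry a trailing ship-date component that trips up parse_only_date(), and the new partition() strips it first. The exact field value is not shown in the diff, so the string below is hypothetical.

pub = '01 Apr 2013, Ship Date: 15 Apr 2013'   # hypothetical raw field value
if ', Ship Date:' in pub:
    pub = pub.partition(', Ship Date:')[0]
print(pub)   # -> '01 Apr 2013', which parse_only_date() can handle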


@@ -39,39 +39,11 @@ class GoogleImages(Source):
             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         if not title:
             return
-        from threading import Thread
-        import time
         timeout = max(60, timeout) # Needs at least a minute
         title = ' '.join(self.get_title_tokens(title))
         author = ' '.join(self.get_author_tokens(authors))
         urls = self.get_image_urls(title, author, log, abort, timeout)
-        if not urls:
-            log('No images found in Google for, title: %r and authors: %r'%(title, author))
-            return
-        urls = urls[:self.prefs['max_covers']]
-        if get_best_cover:
-            urls = urls[:1]
-        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
-        for w in workers:
-            w.daemon = True
-            w.start()
-        alive = True
-        start_time = time.time()
-        while alive and not abort.is_set() and time.time() - start_time < timeout:
-            alive = False
-            for w in workers:
-                if w.is_alive():
-                    alive = True
-                    break
-            abort.wait(0.1)
-
-    def download_image(self, url, timeout, log, result_queue):
-        try:
-            ans = self.browser.open_novisit(url, timeout=timeout).read()
-            result_queue.put((self, ans))
-            log('Downloaded cover from: %s'%url)
-        except Exception:
-            self.log.exception('Failed to download cover from: %r'%url)
+        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

     def get_image_urls(self, title, author, log, abort, timeout):
         from calibre.utils.ipc.simple_worker import fork_job, WorkerError


@@ -51,9 +51,11 @@ def reverse_tag_iter(block):
     end = len(block)
     while True:
         pgt = block.rfind(b'>', 0, end)
-        if pgt == -1: break
+        if pgt == -1:
+            break
         plt = block.rfind(b'<', 0, pgt)
-        if plt == -1: break
+        if plt == -1:
+            break
         yield block[plt:pgt+1]
         end = plt
@@ -231,12 +233,12 @@ class Mobi8Reader(object):
             flowpart = self.flows[j]
             nstr = '%04d' % j
             m = svg_tag_pattern.search(flowpart)
-            if m != None:
+            if m is not None:
                 # svg
                 typ = 'svg'
                 start = m.start()
                 m2 = image_tag_pattern.search(flowpart)
-                if m2 != None:
+                if m2 is not None:
                     format = 'inline'
                     dir = None
                     fname = None
@@ -406,6 +408,10 @@ class Mobi8Reader(object):
             else:
                 imgtype = what(None, data)
                 if imgtype is None:
-                    imgtype = 'unknown'
+                    from calibre.utils.magick.draw import identify_data
+                    try:
+                        imgtype = identify_data(data)[2]
+                    except Exception:
+                        imgtype = 'unknown'
                 href = 'images/%05d.%s'%(fname_idx, imgtype)
                 with open(href.replace('/', os.sep), 'wb') as f:
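
Aside (not part of the commit): a short sketch of why the ImageMagick fallback helps. imghdr.what() returns None for some perfectly valid images, for example JPEGs that lack a JFIF/Exif marker, while calibre's identify_data() asks ImageMagick, which recognises far more variants. The file name below is hypothetical and stands for raw image data pulled out of a MOBI record.

from imghdr import what
from calibre.utils.magick.draw import identify_data

with open('extracted_image.dat', 'rb') as f:   # hypothetical extracted record
    data = f.read()
imgtype = what(None, data)                     # None for e.g. JPEGs without JFIF/Exif
if imgtype is None:
    try:
        imgtype = identify_data(data)[2]       # identify_data -> (width, height, fmt)
    except Exception:
        imgtype = 'unknown'
print(imgtype)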


@@ -72,7 +72,8 @@ def explode(path, dest, question=lambda x:True):
         dest), no_output=True)['result']

 def set_cover(oeb):
-    if 'cover' not in oeb.guide or oeb.metadata['cover']: return
+    if 'cover' not in oeb.guide or oeb.metadata['cover']:
+        return
     cover = oeb.guide['cover']
     if cover.href in oeb.manifest.hrefs:
         item = oeb.manifest.hrefs[cover.href]
@@ -95,8 +96,9 @@ def rebuild(src_dir, dest_path):
     if not opf:
         raise ValueError('No OPF file found in %s'%src_dir)
     opf = opf[0]
-    # For debugging, uncomment the following line
-    # def fork_job(a, b, args=None, no_output=True): do_rebuild(*args)
+    # For debugging, uncomment the following two lines
+    # def fork_job(a, b, args=None, no_output=True):
+    #     do_rebuild(*args)
     fork_job('calibre.ebooks.mobi.tweak', 'do_rebuild', args=(opf, dest_path),
             no_output=True)


@@ -69,7 +69,8 @@ class Resources(object):
                 cover_href = item.href

         for item in self.oeb.manifest.values():
-            if item.media_type not in OEB_RASTER_IMAGES: continue
+            if item.media_type not in OEB_RASTER_IMAGES:
+                continue
             try:
                 data = self.process_image(item.data)
             except:
@@ -116,8 +117,8 @@ class Resources(object):
         Add any images that were created after the call to add_resources()
         '''
         for item in self.oeb.manifest.values():
-            if (item.media_type not in OEB_RASTER_IMAGES or item.href in
-                    self.item_map): continue
+            if (item.media_type not in OEB_RASTER_IMAGES or item.href in self.item_map):
+                continue
             try:
                 data = self.process_image(item.data)
             except:


@@ -270,7 +270,7 @@ BINARY_MIME = 'application/octet-stream'

 XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS

-OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
+OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'])
 OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME,
                 'text/x-oeb-document'])
 OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])


@@ -77,7 +77,7 @@ class Container(object):
         # Map of relative paths with '/' separators from root of unzipped ePub
         # to absolute paths on filesystem with os-specific separators
-        opfpath = os.path.abspath(opfpath)
+        opfpath = os.path.abspath(os.path.realpath(opfpath))
         for dirpath, _dirnames, filenames in os.walk(self.root):
             for f in filenames:
                 path = join(dirpath, f)
@@ -407,7 +408,8 @@ class Container(object):
                     remove.add(child)
             except AttributeError:
                 continue # Happens for XML comments
-        for child in remove: mdata.remove(child)
+        for child in remove:
+            mdata.remove(child)

         if len(mdata) > 0:
             mdata[-1].tail = '\n  '
@@ -483,7 +484,7 @@ class EpubContainer(Container):

     def __init__(self, pathtoepub, log):
         self.pathtoepub = pathtoepub
-        tdir = self.root = PersistentTemporaryDirectory('_epub_container')
+        tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_epub_container')))
         with open(self.pathtoepub, 'rb') as stream:
             try:
                 zf = ZipFile(stream)
@@ -616,7 +617,7 @@ class AZW3Container(Container):

     def __init__(self, pathtoazw3, log):
         self.pathtoazw3 = pathtoazw3
-        tdir = self.root = PersistentTemporaryDirectory('_azw3_container')
+        tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_azw3_container')))
         with open(pathtoazw3, 'rb') as stream:
             raw = stream.read(3)
             if raw == b'TPZ':
@@ -670,7 +671,8 @@ class AZW3Container(Container):
 # }}}

 def get_container(path, log=None):
-    if log is None: log = default_log
+    if log is None:
+        log = default_log
     ebook = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi'}
             else EpubContainer)(path, log)
     return ebook
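
Aside (not part of the commit): the added realpath() calls look like a guard against symlinked locations rather than missing files. On OS X the temporary directory is reached through a symlink (/var -> /private/var), so absolute paths built by os.walk() and paths built from the OPF location can disagree unless both are canonicalised, and dictionary lookups keyed by path then miss. A tiny illustration; the example outputs assume an OS X machine, on most Linux systems the two prints are identical.

import os, tempfile

tdir = tempfile.mkdtemp()
print(os.path.abspath(tdir))    # e.g. /var/folders/.../T/tmpXXXXXX
print(os.path.realpath(tdir))   # e.g. /private/var/folders/.../T/tmpXXXXXX
# Canonicalising both sides makes path-keyed lookups agree.
assert os.path.realpath(os.path.abspath(tdir)) == os.path.realpath(tdir)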


@@ -46,7 +46,8 @@ def is_raster_image(media_type):
     return media_type and media_type.lower() in {
         'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}

-COVER_TYPES = { 'coverimagestandard', 'other.ms-coverimage-standard',
+COVER_TYPES = {
+    'coverimagestandard', 'other.ms-coverimage-standard',
     'other.ms-titleimage-standard', 'other.ms-titleimage',
     'other.ms-coverimage', 'other.ms-thumbimage-standard',
     'other.ms-thumbimage', 'thumbimagestandard', 'cover'}
@@ -92,7 +93,8 @@ def find_cover_page(container):
 def find_cover_image_in_page(container, cover_page):
     root = container.parsed(cover_page)
     body = XPath('//h:body')(root)
-    if len(body) != 1: return
+    if len(body) != 1:
+        return
     body = body[0]
     images = []
     for img in XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body):
@@ -179,7 +181,7 @@ def create_epub_cover(container, cover_path):
         guide = container.opf_get_or_create('guide')
         container.insert_into_xml(guide, guide.makeelement(
             OPF('reference'), type='cover', title=_('Cover'),
-            href=container.name_to_href(titlepage)))
+            href=container.name_to_href(titlepage, base=container.opf_name)))
         metadata = container.opf_get_or_create('metadata')
         meta = metadata.makeelement(OPF('meta'), name='cover')
         meta.set('content', raster_cover_item.get('id'))
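
Aside (not part of the commit): the base=container.opf_name change matters whenever the OPF lives in a subdirectory, because hrefs written into the OPF must be relative to the OPF itself, not to the container root. A minimal illustration of the path arithmetic this is expected to perform, using hypothetical container names rather than calibre's own code.

import posixpath

titlepage = 'OEBPS/titlepage.xhtml'   # hypothetical names
opf_name = 'OEBPS/content.opf'

print(posixpath.relpath(titlepage, '.'))                          # OEBPS/titlepage.xhtml (wrong inside the OPF)
print(posixpath.relpath(titlepage, posixpath.dirname(opf_name)))  # titlepage.xhtml (what the guide reference needs)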


@@ -43,8 +43,8 @@ sizes, adjust margins, etc. Every action performs only the minimum set of
 changes needed for the desired effect.</p>

 <p>You should use this tool as the last step in your ebook creation process.</p>
-
-<p>Note that polishing only works on files in the %s formats.</p>
+{0}
+<p>Note that polishing only works on files in the %s formats.</p>\
 ''')%_(' or ').join('<b>%s</b>'%x for x in SUPPORTED),

 'subset': _('''\
@@ -69,7 +69,7 @@ text might not be covered by the subset font.</p>
 'jacket': _('''\
 <p>Insert a "book jacket" page at the start of the book that contains
 all the book metadata such as title, tags, authors, series, comments,
-etc.</p>'''),
+etc. Any previous book jacket will be replaced.</p>'''),

 'remove_jacket': _('''\
 <p>Remove a previous inserted book jacket page.</p>
@@ -85,7 +85,7 @@ when single quotes at the start of contractions are involved.</p>

 def hfix(name, raw):
     if name == 'about':
-        return raw
+        return raw.format('')
     raw = raw.replace('\n\n', '__XX__')
     raw = raw.replace('\n', ' ')
     raw = raw.replace('__XX__', '\n')
@@ -175,7 +175,7 @@ def gui_polish(data):
     if not data.pop('metadata'):
         data.pop('opf')
     if not data.pop('do_cover'):
-        data.pop('cover')
+        data.pop('cover', None)
     file_map = {x:x for x in files}
     opts = ALL_OPTS.copy()
     opts.update(data)


@@ -51,7 +51,7 @@ class Links(object):
         for link in self.links:
             path, href, frag = link[0]
             page, rect = link[1:]
-            combined_path = os.path.abspath(os.path.join(os.path.dirname(path), *href.split('/')))
+            combined_path = os.path.abspath(os.path.join(os.path.dirname(path), *unquote(href).split('/')))
             is_local = not href or combined_path in self.anchors
             annot = Dictionary({
                 'Type':Name('Annot'), 'Subtype':Name('Link'),
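
Aside (not part of the commit): the unquote() matters because link hrefs arrive percent-encoded while the anchors map is keyed by real filesystem paths, so links to files whose names contain spaces or other escaped characters were never recognised as local. A small sketch with hypothetical paths.

import os
from urllib import unquote   # Python 2, matching this code base

path, href = '/tmp/book/index.html', 'my%20chapter.html'   # hypothetical values
raw = os.path.abspath(os.path.join(os.path.dirname(path), *href.split('/')))
fixed = os.path.abspath(os.path.join(os.path.dirname(path), *unquote(href).split('/')))
print(raw)     # /tmp/book/my%20chapter.html -- misses the anchors dict
print(fixed)   # /tmp/book/my chapter.html   -- matches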


@@ -180,5 +180,6 @@ class BorderParse:
            elif 'single' in border_style_list:
                new_border_dict[att] = 'single'
            else:
-                new_border_dict[att] = border_style_list[0]
+                if border_style_list:
+                    new_border_dict[att] = border_style_list[0]
        return new_border_dict
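
Aside (not part of the commit): the new guard simply avoids an IndexError when the computed style list comes back empty, so such border attributes are skipped instead of aborting the RTF conversion. Sketch with a hypothetical attribute name.

new_border_dict = {}
border_style_list = []            # hypothetical: no recognisable styles parsed
att = 'border-paragraph-top'      # hypothetical attribute name
if border_style_list:             # previously an unconditional [0] lookup
    new_border_dict[att] = border_style_list[0]
print(new_border_dict)            # {}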
