Sync to trunk.

John Schember 2010-02-07 09:18:48 -05:00
commit 3afcb3b2a8
34 changed files with 775 additions and 101 deletions


@@ -27,7 +27,7 @@ p.tags {
 p.description {
     text-align:left;
-    font-style:italic;
+    font-style:normal;
     margin-top: 0em;
 }
@@ -55,6 +55,14 @@ p.author_index {
     text-indent: 0em;
 }
+p.series {
+    text-align: left;
+    margin-top:0px;
+    margin-bottom:0px;
+    margin-left:2em;
+    text-indent:-2em;
+}
 p.read_book {
     text-align:left;
     margin-top:0px;

Six binary image files added (not shown): 1.3 KiB, 764 B, 640 B, 816 B, 810 B, 783 B.


@ -0,0 +1,45 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class ZiveRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'Abelturd'
language = 'sk'
version = 1
title = u'ZIVE.sk'
publisher = u''
category = u'News, Newspaper'
description = u'Naj\u010d\xedtanej\u0161\xed denn\xedk opo\u010d\xedta\u010doch, IT a internete. '
encoding = 'UTF-8'
oldest_article = 7
max_articles_per_feed = 100
use_embedded_content = False
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
cover_url = 'http://www.zive.sk/Client.Images/Logos/logo-zive-sk.gif'
feeds = []
feeds.append((u'V\u0161etky \u010dl\xe1nky', u'http://www.zive.sk/rss/sc-47/default.aspx'))
preprocess_regexps = [
(re.compile(r'<p><p><strong>Pokra.*ie</strong></p>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
remove_tags = []
keep_only_tags = [dict(name='h1'), dict(name='span', attrs={'class':'arlist-data-info-author'}), dict(name='div', attrs={'class':'bbtext font-resizer-area'}),]
extra_css = '''
h1 {font-size:140%;font-family:georgia,serif; font-weight:bold}
h3 {font-size:115%;font-family:georgia,serif; font-weight:bold}
'''


@ -0,0 +1,43 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.digitalspy.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DigitalSpyUK(BasicNewsRecipe):
title = 'Digital Spy - UK Edition'
__author__ = 'Darko Miletic'
description = 'Entertainment news about the biggest TV shows, films and celebrities, updated around the clock.'
publisher = 'Digital Spy Limited.'
category = 'news, showbiz, big brother, x factor, torchwood, doctor who, tv, media, sky, freeview, cable'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'en_GB'
remove_empty_feeds = True
extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .info{font-size: small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['link'])]
remove_attributes = ['height','width']
keep_only_tags = [dict(name='div',attrs={'id':'article'})]
feeds = [
(u'News' , u'http://www.digitalspy.co.uk/rss/zones/gb/all.xml' )
,(u'Big Brother' , u'http://www.digitalspy.co.uk/rss/zones/gb/bigbrother.xml' )
,(u'Entertainment' , u'http://www.digitalspy.co.uk/rss/zones/gb/entertainment.xml')
,(u'General' , u'http://www.digitalspy.co.uk/rss/zones/gb/general.xml' )
,(u'Media' , u'http://www.digitalspy.co.uk/rss/zones/gb/media.xml' )
]


@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
elcomercio.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElComercio(BasicNewsRecipe):
title = 'El Comercio '
__author__ = 'Darko Miletic'
description = "Gizmodo, the gadget guide. So much in love with shiny new toys, it's unnatural."
publisher = 'GRUPO EL COMERCIO C.A.'
category = 'news, Ecuador, politics'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = True
language = 'es'
masthead_url = 'http://ww1.elcomercio.com/nv_images/headers/EC/logo_new_08.gif'
extra_css = ' body{font-family: Arial,Verdana,sans-serif} img{margin-bottom: 1em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_attributes = ['width','height']
feeds = [(u'Articles', u'http://ww1.elcomercio.com/rss/titulares1.xml')]
def preprocess_html(self, soup):
return self.adeify_images(soup)


@ -0,0 +1,40 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
gizmodo.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Gizmodo(BasicNewsRecipe):
title = 'Gizmodo'
__author__ = 'Darko Miletic'
description = "Gizmodo, the gadget guide. So much in love with shiny new toys, it's unnatural."
publisher = 'gizmodo.com'
category = 'news, IT, Internet, gadgets'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = True
language = 'en'
masthead_url = 'http://cache.gawkerassets.com/assets/gizmodo.com/img/logo.png'
extra_css = ' body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif} img{margin-bottom: 1em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_attributes = ['width','height']
remove_tags = [dict(name='div',attrs={'class':'feedflare'})]
remove_tags_after = dict(name='div',attrs={'class':'feedflare'})
feeds = [(u'Articles', u'http://feeds.gawker.com/gizmodo/full')]
def preprocess_html(self, soup):
return self.adeify_images(soup)


@@ -18,7 +18,8 @@ class HBR(BasicNewsRecipe):
     remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
         'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
         'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
-        'mailingListTout', 'partnerCenter', 'pageFooter']),
+        'mailingListTout', 'partnerCenter', 'pageFooter',
+        'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
         dict(name='iframe')]
     extra_css = '''
                 a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }


@ -0,0 +1,47 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class SmeRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'Abelturd'
language = 'cz'
version = 1
title = u'iLiteratura.cz'
publisher = u''
category = u'News, Newspaper'
description = u'O LITERATU\u0158E V CEL\xc9M SV\u011aT\u011a A DOMA'
cover_url = 'http://www.iliteratura.cz/1_vzhled/1/iliteratura.gif'
oldest_article = 7
max_articles_per_feed = 100
use_embedded_content = False
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
feeds = []
feeds.append((u'\u010cl\xe1nky', u'http://www.iliteratura.cz/rss.asp'))
keep_only_tags = []
remove_tags = [dict(name='table'),dict(name='h3')]
preprocess_regexps = [
(re.compile(r'<h3>Souvisej.*</body>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
def print_version(self, url):
m = re.search('(?<=ID=)[0-9]*', url)
return u'http://www.iliteratura.cz/clanek.asp?polozkaID=' + str(m.group(0)) + '&c=tisk'
extra_css = '''
h1 {font-size:140%;font-family:georgia,serif; font-weight:bold}
h3 {font-size:115%;font-family:georgia,serif; font-weight:bold}
'''


@@ -4,7 +4,7 @@ class Metro_Montreal(BasicNewsRecipe):
     title = u'M\xe9tro Montr\xe9al'
     __author__ = 'Jerry Clapperton'
-    description = 'Le quotidien le plus branché sur le monde'
+    description = 'Le quotidien le plus branch\xe9 sur le monde'
     language = 'fr'
     oldest_article = 7
@@ -16,7 +16,7 @@ class Metro_Montreal(BasicNewsRecipe):
     extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
     remove_tags = [dict(attrs={'id':'buttons'})]
     feeds = [
        (u"L'info", u'http://journalmetro.com/linfo/rss'),
        (u'Monde', u'http://journalmetro.com/monde/rss'),
@@ -26,4 +26,4 @@ class Metro_Montreal(BasicNewsRecipe):
    ]
    def print_version(self, url):
        return url.replace('article', 'ArticlePrint') + '?language=fr'


@ -0,0 +1,35 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nst.com.my
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Newstraitstimes(BasicNewsRecipe):
title = 'New Straits Times from Malaysia'
__author__ = 'Darko Miletic'
description = 'Learning Curve, Sunday People, New Straits Times from Malaysia'
publisher = 'nst.com.my'
category = 'news, politics, Malaysia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'en'
masthead_url = 'http://www.nst.com.my/Current_News/NST/Images/new-nstonline.jpg'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [dict(name=['link','table'])]
keep_only_tags = dict(name='div',attrs={'id':'haidah'})
feeds = [(u'Articles', u'http://www.nst.com.my/rss/allSec')]


@@ -1,13 +1,12 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 pagina12.com.ar
 '''
-import time
-from calibre import strftime
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup

 class Pagina12(BasicNewsRecipe):
     title = 'Pagina - 12'
@@ -22,7 +21,8 @@ class Pagina12(BasicNewsRecipe):
     use_embedded_content = False
     language = 'es'
     remove_empty_feeds = True
-    extra_css = ' body{font-family: sans-serif} '
+    masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif'
+    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h2{color: #028CCD} img{margin-bottom: 0.4em} .epigrafe{font-size: x-small; background-color: #EBEAE5; color: #565144 } .intro{font-size: 1.1em} '

     conversion_options = {
                           'comment' : description
@@ -52,7 +52,11 @@ class Pagina12(BasicNewsRecipe):
         return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')

     def get_cover_url(self):
-        imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg']
-        weekday = time.localtime().tm_wday
-        return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday]
+        rawc = self.index_to_soup('http://www.pagina12.com.ar/diario/principal/diario/index.html',True)
+        rawc2 = re.sub(r'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN','PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',rawc)
+        soup = BeautifulSoup(rawc2,fromEncoding=self.encoding,smartQuotesTo=None)
+        for image in soup.findAll('img',alt=True):
+            if image['alt'].startswith('Tapa de la fecha'):
+                return image['src']
+        return None


@@ -31,7 +31,7 @@ class PeopleMag(BasicNewsRecipe):
     keep_only_tags = [
-        dict(name='div', attrs={'class': 'panel_news_article_main'}),
+        dict(name='div', attrs={'class':'article_content'}),
         dict(name='div', attrs={'class': 'headline'}),
         dict(name='div', attrs={'class': 'post'}),
@@ -51,6 +51,7 @@ class PeopleMag(BasicNewsRecipe):
         dict(name='div', attrs={'class':'sharelinkcont'}),
         dict(name='div', attrs={'class':'categories'}),
         dict(name='ul', attrs={'class':'categories'}),
+        dict(name='div', attrs={'class':'related_content'}),
         dict(name='div', attrs={'id':'promo'}),
         dict(name='div', attrs={'class':'linksWrapper'}),
         dict(name='p', attrs={'class':'tag tvnews'}),


@ -0,0 +1,64 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
readitlaterlist.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Readitlater(BasicNewsRecipe):
title = 'Read It Later'
__author__ = 'Darko Miletic'
description = '''Personalized news feeds. Go to readitlaterlist.com to
setup up your news. Fill in your account
username, and optionally you can add password.'''
publisher = 'readitlater.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
INDEX = u'http://readitlaterlist.com'
LOGIN = INDEX + u'/l'
feeds = [(u'Unread articles' , INDEX + u'/unread')]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None:
br.open(self.LOGIN)
br.select_form(nr=0)
br['feed_id'] = self.username
if self.password is not None:
br['password'] = self.password
br.submit()
return br
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
ritem = soup.find('ul',attrs={'id':'list'})
for item in ritem.findAll('li'):
description = ''
atag = item.find('a',attrs={'class':'text'})
if atag and atag.has_key('href'):
url = self.INDEX + atag['href']
title = self.tag_to_string(item.div)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
totalfeeds.append((feedtitle, articles))
return totalfeeds


@ -1,22 +0,0 @@
from calibre.web.feeds.news import BasicNewsRecipe
class The_Gazette(BasicNewsRecipe):
cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg'
title = u'The Gazette'
__author__ = 'Jerry Clapperton'
description = 'Montreal news in English'
language = 'en_CA'
oldest_article = 7
max_articles_per_feed = 20
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
encoding = 'utf-8'
keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})]
extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')]


@@ -9,6 +9,7 @@ class The_New_Republic(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
+    no_stylesheets = True

     remove_tags = [
         dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}),
@@ -21,14 +22,15 @@ class The_New_Republic(BasicNewsRecipe):
             ('Economy', 'http://www.tnr.com/rss/articles/Economy'),
             ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'),
             ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'),
-            ('Urban Policy', 'http://www.tnr.com/rss/articles/Urban-Policy'),
+            ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'),
             ('World', 'http://www.tnr.com/rss/articles/World'),
             ('Film', 'http://www.tnr.com/rss/articles/Film'),
             ('Books', 'http://www.tnr.com/rss/articles/books'),
+            ('The Book', 'http://www.tnr.com/rss/book'),
+            ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'),
             ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'),
             ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'),
             ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'),
+            ('The Stash', 'http://www.tnr.com/rss/blogs/The-Stash'),
             ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'),
             ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'),
             ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'),
@@ -40,3 +42,4 @@ class The_New_Republic(BasicNewsRecipe):
     def print_version(self, url):
         return url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/')


@ -0,0 +1,53 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
db.tidbits.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TidBITS(BasicNewsRecipe):
title = 'TidBITS: Mac News for the Rest of Us'
__author__ = 'Darko Miletic'
description = 'Insightful news, reviews, and analysis of the Macintosh and Internet worlds'
publisher = 'TidBITS Publishing Inc.'
category = 'news, Apple, Macintosh, IT, Internet'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = True
language = 'en'
remove_empty_feeds = True
masthead_url = 'http://db.tidbits.com/images/tblogo9.gif'
extra_css = ' body{font-family: Georgia,"Times New Roman",Times,serif} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_attributes = ['width','height']
remove_tags = [dict(name='small')]
remove_tags_after = dict(name='small')
feeds = [
(u'Business Apps' , u'http://db.tidbits.com/feeds/business.rss' )
,(u'Entertainment' , u'http://db.tidbits.com/feeds/entertainment.rss')
,(u'External Links' , u'http://db.tidbits.com/feeds/links.rss' )
,(u'Home Mac' , u'http://db.tidbits.com/feeds/home.rss' )
,(u'Inside TidBITS' , u'http://db.tidbits.com/feeds/inside.rss' )
,(u'iPod & iPhone' , u'http://db.tidbits.com/feeds/ipod-iphone.rss' )
,(u'Just for Fun' , u'http://db.tidbits.com/feeds/fun.rss' )
,(u'Macs & Mac OS X' , u'http://db.tidbits.com/feeds/macs.rss' )
,(u'Media Creation' , u'http://db.tidbits.com/feeds/creative.rss' )
,(u'Networking & Communications', u'http://db.tidbits.com/feeds/net.rss' )
,(u'Opinion & Editorial' , u'http://db.tidbits.com/feeds/opinion.rss' )
,(u'Support & Problem Solving' , u'http://db.tidbits.com/feeds/support.rss' )
,(u'Safe Computing' , u'http://db.tidbits.com/feeds/security.rss' )
,(u'Tech News' , u'http://db.tidbits.com/feeds/tech.rss' )
,(u'Software Watchlist' , u'http://db.tidbits.com/feeds/watchlist.rss' )
]


@@ -215,7 +215,7 @@ class WSJ(BasicNewsRecipe):
             # first, check if there is an h3 tag which provides a section name
             stag = divtag.find('h3')
             if stag:
-                if stag.parent['class'] == 'dynamic':
+                if stag.parent.get('class', '') == 'dynamic':
                     # a carousel of articles is too complex to extract a section name
                     # for each article, so we'll just call the section "Carousel"
                     section_name = 'Carousel'


@@ -48,7 +48,9 @@ class Resources(Command):
         dest = self.j(self.RESOURCES, 'builtin_recipes.xml')
         if self.newer(dest, files):
             self.info('\tCreating builtin_recipes.xml')
-            open(dest, 'wb').write(serialize_builtin_recipes())
+            xml = serialize_builtin_recipes()
+            with open(dest, 'wb') as f:
+                f.write(xml)

         dest = self.j(self.RESOURCES, 'ebook-convert-complete.pickle')
         files = []


@@ -262,7 +262,6 @@ class Region(object):
             max_lines = max(max_lines, len(c))
         return max_lines

     @property
     def is_small(self):
         return self.line_count < 3
@@ -438,9 +437,8 @@ class Page(object):
         # absorb into a neighboring region (prefer the one with number of cols
         # closer to the avg number of cols in the set, if equal use larger
         # region)
-        # merge contiguous regions that can contain each other
-        '''absorbed = set([])
         found = True
+        absorbed = set([])
         while found:
             found = False
             for i, region in enumerate(self.regions):
@@ -452,10 +450,33 @@ class Page(object):
                         regions.append(self.regions[j])
                     else:
                         break
-                prev = None if i == 0 else i-1
-                next = j if self.regions[j] not in regions else None
-                '''
-        pass
+                prev_region = None if i == 0 else i-1
+                next_region = j if self.regions[j] not in regions else None
+                if prev_region is None and next_region is not None:
+                    absorb_into = next_region
+                elif next_region is None and prev_region is not None:
+                    absorb_into = prev_region
+                elif prev_region is None and next_region is None:
+                    if len(regions) > 1:
+                        absorb_into = regions[0]
+                        regions = regions[1:]
+                    else:
+                        absorb_into = None
+                else:
+                    absorb_into = prev_region
+                    if next_region.line_count >= prev_region.line_count:
+                        avg_column_count = sum([len(r.columns) for r in
+                            regions])/float(len(regions))
+                        if next_region.line_count > prev_region.line_count \
+                           or abs(avg_column_count - len(prev_region.columns)) \
+                           > abs(avg_column_count - len(next_region.columns)):
+                            absorb_into = next_region
+                if absorb_into is not None:
+                    absorb_into.absorb_region(regions)
+                    absorbed.update(regions)
+                    i = j
+        for region in absorbed:
+            self.regions.remove(region)


@@ -72,7 +72,7 @@ class Tokenize:
         return line
     def __compile_expressions(self):
         self.__ms_hex_exp = re.compile(r"\\\'(..)")
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6})")
         self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
         self.__par_exp = re.compile(r'\\$')
         self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")


@@ -80,7 +80,7 @@
        <widget class="QLabel" name="label_6">
         <property name="text">
          <string>Regex tips:
-- The default regex - \[[\w]*\] - excludes genre tags of the form [tag], e.g., [Amazon Freebie]
+- The default regex - \[[\w ]*\] - excludes genre tags of the form [tag], e.g., [Amazon Freebie]
 - A regex pattern of a single dot excludes all genre tags, generating no Genre Section</string>
         </property>
         <property name="wordWrap">


@@ -57,7 +57,8 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, sync, fmt_options,
             setattr(opts,option, fmt_options[option])

     # Fetch and run the plugin for fmt
+    # Returns 0 if successful, 1 if no catalog built
     plugin = plugin_for_catalog_format(fmt)
-    plugin.run(out_file_name, opts, db, notification=notification)
+    return plugin.run(out_file_name, opts, db, notification=notification)


@@ -149,7 +149,7 @@ class DeviceManager(Thread):
                     possibly_connected_devices.append((device, detected_device))
             if possibly_connected_devices:
                 if not self.do_connect(possibly_connected_devices):
-                    print 'Connect to device failed, retying in 5 seconds...'
+                    print 'Connect to device failed, retrying in 5 seconds...'
                     time.sleep(5)
                     if not self.do_connect(possibly_connected_devices):
                         print 'Device connect failed again, giving up'


@@ -594,6 +594,11 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 self.rating.setValue(int(book.rating))
             if book.tags:
                 self.tags.setText(', '.join(book.tags))
+            if book.series is not None:
+                if self.series.text() is None or self.series.text() == '':
+                    self.series.setText(book.series)
+            if book.series_index is not None:
+                self.series_index.setValue(book.series_index)
         else:
             error_dialog(self, _('Cannot fetch metadata'),
                 _('You must specify at least one of ISBN, Title, '


@@ -903,9 +903,13 @@ class OnDeviceSearch(SearchQueryParser):
                 locations[i] = q[v]
         for i, r in enumerate(self.model.db):
             for loc in locations:
-                if query in loc(r):
-                    matches.add(i)
-                    break
+                try:
+                    if query in loc(r):
+                        matches.add(i)
+                        break
+                except ValueError: # Unicode errors
+                    import traceback
+                    traceback.print_exc()
         return matches


@@ -1394,6 +1394,11 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
         self.status_bar.showMessage(_('Generating %s catalog...')%fmt)

     def catalog_generated(self, job):
+        if job.result:
+            # Search terms nulled catalog results
+            return error_dialog(self, _('No books found'),
+                    _("No books to catalog\nCheck exclude tags"),
+                    show=True)
         if job.failed:
             return self.job_exception(job)
         id = self.library_view.model().add_catalog(job.catalog_file_path, job.catalog_title)


@@ -927,8 +927,16 @@ class EPUB_MOBI(CatalogPlugin):
         for record in data:
             this_title = {}

-            title = this_title['title'] = self.convertHTMLEntities(record['title'])
-            this_title['title_sort'] = self.generateSortTitle(title)
+            this_title['title'] = self.convertHTMLEntities(record['title'])
+            if record['series']:
+                this_title['series'] = record['series']
+                this_title['series_index'] = record['series_index']
+                this_title['title'] = self.generateSeriesTitle(this_title)
+            else:
+                this_title['series'] = None
+                this_title['series_index'] = 0.0
+
+            this_title['title_sort'] = self.generateSortTitle(this_title['title'])
             if 'authors' in record and len(record['authors']):
                 this_title['author'] = " &amp; ".join(record['authors'])
             else:
@@ -984,12 +992,59 @@ class EPUB_MOBI(CatalogPlugin):
     def fetchBooksByAuthor(self):
         # Generate a list of titles sorted by author from the database

+        def author_compare(x,y):
+            # Return -1 if x<y
+            # Return  0 if x==y
+            # Return  1 if x>y
+
+            # Different authors - sort by author_sort
+            if x['author_sort'] > y['author_sort']:
+                return 1
+            elif x['author_sort'] < y['author_sort']:
+                return -1
+            else:
+                # Same author
+                if x['series'] != y['series']:
+                    # Different series
+                    if x['title_sort'].lstrip() > y['title_sort'].lstrip():
+                        return 1
+                    else:
+                        return -1
+                else:
+                    # Same series
+                    if x['series'] == y['series']:
+                        if float(x['series_index']) > float(y['series_index']):
+                            return 1
+                        elif float(x['series_index']) < float(y['series_index']):
+                            return -1
+                        else:
+                            return 0
+                    else:
+                        if x['series'] > y['series']:
+                            return 1
+                        else:
+                            return -1
+
         self.updateProgressFullStep("Sorting database")

-        # Sort titles case-insensitive
+        '''
+        # Sort titles case-insensitive, by author
         self.booksByAuthor = sorted(self.booksByTitle,
                                     key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
+        '''
+        self.booksByAuthor = list(self.booksByTitle)
+        self.booksByAuthor.sort(author_compare)
+
+        if False and self.verbose:
+            self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
+            self.opts.log.info(" %-30s %-20s %s" % ('title', 'title_sort','series', 'series_index'))
+            for title in self.booksByAuthor:
+                self.opts.log.info((u" %-30s %-20s %-20s%5s " % \
+                                    (title['title'][:30],
+                                     title['series'][:20] if title['series'] else '',
+                                     title['series_index'],
+                                     )).encode('utf-8'))
+            raise SystemExit

         # Build the unique_authors set from existing data
         authors = [(record['author'], record['author_sort']) for record in self.booksByAuthor]
@@ -1063,7 +1118,17 @@ class EPUB_MOBI(CatalogPlugin):
             # Insert the book title
             #<p class="title"><a name="<database_id>"></a><em>Book Title</em></p>
             emTag = Tag(soup, "em")
-            emTag.insert(0, NavigableString(escape(title['title'])))
+            if title['series']:
+                # title<br />series series_index
+                brTag = Tag(soup,'br')
+                title_tokens = title['title'].split(': ')
+                emTag.insert(0, NavigableString(title_tokens[1]))
+                emTag.insert(1, brTag)
+                smallTag = Tag(soup,'small')
+                smallTag.insert(0,NavigableString(title_tokens[0]))
+                emTag.insert(2, smallTag)
+            else:
+                emTag.insert(0, NavigableString(escape(title['title'])))
             titleTag = body.find(attrs={'class':'title'})
             titleTag.insert(0,emTag)
@@ -1073,7 +1138,12 @@ class EPUB_MOBI(CatalogPlugin):
             aTag['href'] = "%s.html#%s" % ("ByAlphaAuthor", self.generateAuthorAnchor(title['author']))
             #aTag.insert(0, escape(title['author']))
             aTag.insert(0, title['author'])
-            authorTag.insert(0, NavigableString("by "))
+            # Insert READ_SYMBOL
+            if title['read']:
+                authorTag.insert(0, NavigableString(self.READ_SYMBOL + "by "))
+            else:
+                authorTag.insert(0, NavigableString(self.NOT_READ_SYMBOL + "by "))
             authorTag.insert(1, aTag)

             '''
@@ -1085,6 +1155,27 @@ class EPUB_MOBI(CatalogPlugin):
             tagsTag.insert(0,emTag)
             '''

+            '''
+            # Insert Series info or remove.
+            seriesTag = body.find(attrs={'class':'series'})
+            if title['series']:
+                # Insert a spacer to match the author indent
+                stc = 0
+                fontTag = Tag(soup,"font")
+                fontTag['style'] = 'color:white;font-size:large'
+                if self.opts.fmt == 'epub':
+                    fontTag['style'] += ';opacity: 0.0'
+                fontTag.insert(0, NavigableString("by "))
+                seriesTag.insert(stc, fontTag)
+                stc += 1
+                if float(title['series_index']) - int(title['series_index']):
+                    series_str = 'Series: %s [%4.2f]' % (title['series'], title['series_index'])
+                else:
+                    series_str = '%s [%d]' % (title['series'], title['series_index'])
+                seriesTag.insert(stc,NavigableString(series_str))
+            else:
+                seriesTag.extract()
+            '''
+
             # Insert linked genres
             if 'tags' in title:
                 tagsTag = body.find(attrs={'class':'tags'})
@@ -1118,7 +1209,12 @@ class EPUB_MOBI(CatalogPlugin):
             else:
                 imgTag['src'] = "../images/thumbnail_default.jpg"
             imgTag['alt'] = "cover"
-            imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH, self.THUMB_HEIGHT)
+
+            # Tweak image size if we're building for Sony, not sure why this is needed
+            if self.opts.fmt == 'epub' and self.opts.output_profile.startswith("sony"):
+                imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH * 2, self.THUMB_HEIGHT * 2)
+            else:
+                imgTag['style'] = 'width: %dpx; height:%dpx;' % (self.THUMB_WIDTH, self.THUMB_HEIGHT)
             thumbnailTag = body.find(attrs={'class':'thumbnail'})
             thumbnailTag.insert(0,imgTag)
@@ -1310,8 +1406,9 @@ class EPUB_MOBI(CatalogPlugin):
         dtc = 0
         current_letter = ""
         current_author = ""
+        current_series = None

-        # Loop through books_by_author
+        # Loop through booksByAuthor
         book_count = 0
         for book in self.booksByAuthor:
             book_count += 1
@@ -1349,11 +1446,23 @@ class EPUB_MOBI(CatalogPlugin):
                 divTag.insert(dtc,pAuthorTag)
                 dtc += 1

+            # Check for series
+            if book['series'] and book['series'] != current_series:
+                # Start a new series
+                current_series = book['series']
+                pSeriesTag = Tag(soup,'p')
+                pSeriesTag['class'] = "series"
+                pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + book['series']))
+                divTag.insert(dtc,pSeriesTag)
+                dtc += 1
+            if current_series and not book['series']:
+                current_series = None
+
             # Add books
             pBookTag = Tag(soup, "p")
             ptc = 0

-            # Prefix book with read/unread symbol
+            # book with read/unread symbol
             if book['read']:
                 # check mark
                 pBookTag.insert(ptc,NavigableString(self.READ_SYMBOL))
@@ -1367,7 +1476,11 @@ class EPUB_MOBI(CatalogPlugin):
             aTag = Tag(soup, "a")
             aTag['href'] = "book_%d.html" % (int(float(book['id'])))
-            aTag.insert(0,escape(book['title']))
+            # Use series, series index if avail else just title
+            if current_series:
+                aTag.insert(0,escape(book['title'][len(book['series'])+1:]))
+            else:
+                aTag.insert(0,escape(book['title']))
             pBookTag.insert(ptc, aTag)
             ptc += 1
@@ -1419,6 +1532,7 @@ class EPUB_MOBI(CatalogPlugin):
             divTag.insert(dtc,pIndexTag)
             dtc += 1
             current_author = None
+            current_series = None

             for new_entry in this_months_list:
                 if new_entry['author'] != current_author:
@@ -1435,6 +1549,18 @@ class EPUB_MOBI(CatalogPlugin):
                     divTag.insert(dtc,pAuthorTag)
                     dtc += 1

+                # Check for series
+                if new_entry['series'] and new_entry['series'] != current_series:
+                    # Start a new series
+                    current_series = new_entry['series']
+                    pSeriesTag = Tag(soup,'p')
+                    pSeriesTag['class'] = "series"
+                    pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + new_entry['series']))
+                    divTag.insert(dtc,pSeriesTag)
+                    dtc += 1
+                if current_series and not new_entry['series']:
+                    current_series = None
+
                 # Add books
                 pBookTag = Tag(soup, "p")
                 ptc = 0
@@ -1453,7 +1579,10 @@ class EPUB_MOBI(CatalogPlugin):
                 aTag = Tag(soup, "a")
                 aTag['href'] = "book_%d.html" % (int(float(new_entry['id'])))
-                aTag.insert(0,escape(new_entry['title']))
+                if current_series:
+                    aTag.insert(0,escape(new_entry['title'][len(new_entry['series'])+1:]))
+                else:
+                    aTag.insert(0,escape(new_entry['title']))
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
@@ -1554,6 +1683,7 @@ class EPUB_MOBI(CatalogPlugin):
                     this_book['author_sort'] = book['author_sort']
                     this_book['read'] = book['read']
                     this_book['id'] = book['id']
+                    this_book['series'] = book['series']
                     normalized_tag = self.genre_tags_dict[friendly_tag]
                     genre_tag_list = [key for genre in genre_list for key in genre]
                     if normalized_tag in genre_tag_list:
@@ -1579,7 +1709,9 @@ class EPUB_MOBI(CatalogPlugin):
             for genre in genre_list:
                 for key in genre:
-                    self.opts.log.info(" %s: %d titles" % (key, len(genre[key])))
+                    self.opts.log.info(" %s: %d %s" % (self.getFriendlyGenreTag(key),
+                                       len(genre[key]),
+                                       'titles' if len(genre[key]) > 1 else 'title'))

         # Write the results
         # genre_list = [ {friendly_tag:[{book},{book}]}, {friendly_tag:[{book},{book}]}, ...]
@@ -1786,7 +1918,9 @@ class EPUB_MOBI(CatalogPlugin):
             mtc += 1

         # HTML files - add books to manifest and spine
-        for book in self.booksByTitle:
+        sort_descriptions_by = self.booksByAuthor if self.opts.sort_descriptions_by_author \
+                               else self.booksByTitle
+        for book in sort_descriptions_by:
             # manifest
             itemTag = Tag(soup, "item")
             itemTag['href'] = "content/book_%d.html" % int(book['id'])
@@ -1912,7 +2046,9 @@ class EPUB_MOBI(CatalogPlugin):
         nptc += 1

         # Loop over the titles
-        for book in self.booksByTitle:
+        sort_descriptions_by = self.booksByAuthor if self.opts.sort_descriptions_by_author \
+                               else self.booksByTitle
+        for book in sort_descriptions_by:
             navPointVolumeTag = Tag(ncx_soup, 'navPoint')
             navPointVolumeTag['class'] = "article"
             navPointVolumeTag['id'] = "book%dID" % int(book['id'])
@@ -1920,7 +2056,11 @@ class EPUB_MOBI(CatalogPlugin):
             self.playOrder += 1
             navLabelTag = Tag(ncx_soup, "navLabel")
             textTag = Tag(ncx_soup, "text")
-            textTag.insert(0, NavigableString(self.formatNCXText(book['title'])))
+            if book['series']:
+                tokens = book['title'].split(': ')
+                textTag.insert(0, NavigableString(self.formatNCXText('%s (%s)' % (tokens[1], tokens[0]))))
+            else:
+                textTag.insert(0, NavigableString(self.formatNCXText(book['title'])))
             navLabelTag.insert(0,textTag)
             navPointVolumeTag.insert(0,navLabelTag)
@@ -2426,15 +2566,25 @@ class EPUB_MOBI(CatalogPlugin):
                 else:
                     yield tag

-        self.opts.log.info(u' %d available genre tags in database (exclude_genre: %s):' % \
+        self.opts.log.info(u' %d genre tags in database (excluding genres matching %s):' % \
             (len(genre_tags_dict), self.opts.exclude_genre))

        # Display friendly/normalized genres
        # friendly => normalized
-        sorted_tags = ['%s => %s' % (key, genre_tags_dict[key]) for key in sorted(genre_tags_dict.keys())]
+        if False:
+            sorted_tags = ['%s => %s' % (key, genre_tags_dict[key]) for key in sorted(genre_tags_dict.keys())]
             for tag in next_tag(sorted_tags):
                 self.opts.log(u' %s' % tag)
+        else:
+            sorted_tags = ['%s' % (key) for key in sorted(genre_tags_dict.keys())]
+            out_str = ''
+            line_break = 70
+            for tag in next_tag(sorted_tags):
+                out_str += tag
+                if len(out_str) >= line_break:
+                    self.opts.log.info(' %s' % out_str)
+                    out_str = ''
+            self.opts.log.info(' %s' % out_str)

         return genre_tags_dict
@@ -2474,19 +2624,15 @@ class EPUB_MOBI(CatalogPlugin):
             body.insert(btc,aTag)
             btc += 1

-            # Find the first instance of friendly_tag matching genre
-            for friendly_tag in self.genre_tags_dict:
-                if self.genre_tags_dict[friendly_tag] == genre:
-                    break

             titleTag = body.find(attrs={'class':'title'})
-            titleTag.insert(0,NavigableString('<b><i>%s</i></b>' % escape(friendly_tag)))
+            titleTag.insert(0,NavigableString('<b><i>%s</i></b>' % escape(self.getFriendlyGenreTag(genre))))

             # Insert the books by author list
             divTag = body.find(attrs={'class':'authors'})
             dtc = 0
             current_author = ''
+            current_series = None

             for book in books:
                 if book['author'] != current_author:
                     # Start a new author with link
@@ -2502,6 +2648,19 @@ class EPUB_MOBI(CatalogPlugin):
                     divTag.insert(dtc,pAuthorTag)
                     dtc += 1

+                # Check for series
+                if book['series'] and book['series'] != current_series:
+                    # Start a new series
+                    current_series = book['series']
+                    pSeriesTag = Tag(soup,'p')
+                    pSeriesTag['class'] = "series"
+                    pSeriesTag.insert(0,NavigableString(self.NOT_READ_SYMBOL + book['series']))
+                    divTag.insert(dtc,pSeriesTag)
+                    dtc += 1
+                if current_series and not book['series']:
+                    current_series = None
+
                 # Add books
                 pBookTag = Tag(soup, "p")
                 ptc = 0
@@ -2518,7 +2677,11 @@ class EPUB_MOBI(CatalogPlugin):
                 # Add the book title
                 aTag = Tag(soup, "a")
                 aTag['href'] = "book_%d.html" % (int(float(book['id'])))
-                aTag.insert(0,escape(book['title']))
+                # Use series, series index if avail else just title
+                if current_series:
+                    aTag.insert(0,escape(book['title'][len(book['series'])+1:]))
+                else:
+                    aTag.insert(0,escape(book['title']))
                 pBookTag.insert(ptc, aTag)
                 ptc += 1
@@ -2553,6 +2716,7 @@ class EPUB_MOBI(CatalogPlugin):
                     <p class="title"></p>
                     {0}
                     <p class="author"></p>
+                    <!--p class="series"></p-->
                     <p class="tags">&nbsp;</p>
                     <table width="100%" border="0">
                       <tr>
@@ -2678,6 +2842,17 @@ class EPUB_MOBI(CatalogPlugin):
         draw.text((left, top), text, fill=(0,0,0), font=font)
         img.save(open(out_path, 'wb'), 'GIF')

+    def generateSeriesTitle(self, title):
+        if float(title['series_index']) - int(title['series_index']):
+            series_title = '%s %4.2f: %s' % (title['series'],
+                                             title['series_index'],
+                                             title['title'])
+        else:
+            series_title = '%s %d: %s' % (title['series'],
+                                          title['series_index'],
+                                          title['title'])
+        return series_title
+
     def generateShortDescription(self, description):
         # Truncate the description to description_clip, on word boundaries if necessary
         if not description:
@@ -2775,33 +2950,115 @@ class EPUB_MOBI(CatalogPlugin):
             else:
                 return char

+    def getFriendlyGenreTag(self, genre):
+        # Find the first instance of friendly_tag matching genre
+        for friendly_tag in self.genre_tags_dict:
+            if self.genre_tags_dict[friendly_tag] == genre:
+                return friendly_tag
+
     def markdownComments(self, comments):
-        ''' Convert random comment text to normalized, xml-legal block of <p>s'''
-        # reformat illegal xml
-        desc = prepare_string_for_xml(comments)
-
-        # normalize <br/> tags
-        desc = re.sub(r'&lt;br[/]{0,1}&gt;', '<br/>', desc)
-
-        # tokenize double line breaks
-        desc = comments.replace('\r', '')
-        tokens = comments.split('\n\n')
-        soup = BeautifulSoup()
-        ptc = 0
-        for token in tokens:
-            pTag = Tag(soup, 'p')
-            pTag.insert(0,token)
-            soup.insert(ptc, pTag)
-            ptc += 1
-        return soup.renderContents(encoding=None)
+        '''
+        Convert random comment text to normalized, xml-legal block of <p>s
+        'plain text' returns as
+        <p>plain text</p>
+
+        'plain text with <i>minimal</i> <b>markup</b>' returns as
+        <p>plain text with <i>minimal</i> <b>markup</b></p>
+
+        '<p>pre-formatted text</p> returns untouched
+
+        'A line of text\n\nFollowed by a line of text' returns as
+        <p>A line of text</p>
+        <p>Followed by a line of text</p>
+
+        'A line of text.\nA second line of text.\rA third line of text' returns as
+        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>
+
+        '...end of a paragraph.Somehow the break was lost...' returns as
+        <p>...end of a paragraph.</p>
+        <p>Somehow the break was lost...</p>
+
+        Deprecated HTML returns as HTML via BeautifulSoup()
+        '''
+
+        # Explode lost CRs to \n\n
+        # Hackish - ignoring sentences ending or beginning in numbers to avoid
+        # confusion with decimal points.
+        for lost_cr in re.finditer('([a-z])([\.\?!])([A-Z])',comments):
+            comments = comments.replace(lost_cr.group(),
+                                        '%s%s\n\n%s' % (lost_cr.group(1),
+                                                        lost_cr.group(2),
+                                                        lost_cr.group(3)))
+        # Convert \n\n to <p>s
+        if re.search('\n\n', comments):
+            soup = BeautifulSoup()
+            split_ps = comments.split('\n\n')
+            tsc = 0
+            for p in split_ps:
+                pTag = Tag(soup,'p')
+                pTag.insert(0,p)
+                soup.insert(tsc,pTag)
+                tsc += 1
+            comments = soup.renderContents()
+
+        # Convert solo returns to <br />
+        comments = re.sub('[\r\n]','<br />', comments)
+
+        soup = BeautifulSoup(comments)
+        result = BeautifulSoup()
+        rtc = 0
+        open_pTag = False
+
+        all_tokens = list(soup.contents)
+        for token in all_tokens:
+            if type(token) is NavigableString:
+                if not open_pTag:
+                    pTag = Tag(result,'p')
+                    open_pTag = True
+                    ptc = 0
+                pTag.insert(ptc,prepare_string_for_xml(token))
+                ptc += 1
+            elif token.name in ['br','b','i']:
+                if not open_pTag:
+                    pTag = Tag(result,'p')
+                    open_pTag = True
+                    ptc = 0
+                pTag.insert(ptc, token)
+                ptc += 1
+            else:
+                if open_pTag:
+                    result.insert(rtc, pTag)
+                    rtc += 1
+                    open_pTag = False
+                    ptc = 0
+                # Clean up NavigableStrings for xml
+                sub_tokens = list(token.contents)
+                for sub_token in sub_tokens:
+                    if type(sub_token) is NavigableString:
+                        sub_token.replaceWith(prepare_string_for_xml(sub_token))
+                result.insert(rtc, token)
+                rtc += 1
+
+        if open_pTag:
+            result.insert(rtc, pTag)
+
+        paras = result.findAll('p')
+        for p in paras:
+            p['class'] = 'description'
+
+        return result.renderContents(encoding=None)

     def processSpecialTags(self, tags, this_title, opts):
         tag_list = []
         for tag in tags:
             tag = self.convertHTMLEntities(tag)
             if tag.startswith(opts.note_tag):
-                this_title['notes'] = tag[1:]
+                this_title['notes'] = tag[len(self.opts.note_tag):]
             elif tag == opts.read_tag:
                 this_title['read'] = True
             elif re.search(opts.exclude_genre, tag):
@@ -2847,6 +3104,8 @@ class EPUB_MOBI(CatalogPlugin):
         opts.basename = "Catalog"
         opts.plugin_path = self.plugin_path
         opts.cli_environment = not hasattr(opts,'sync')
+        # GwR *** hardwired to sort by author, could be an option if passed in opts
+        opts.sort_descriptions_by_author = True

         if opts.verbose:
             opts_dict = vars(opts)
@@ -2855,15 +3114,30 @@ class EPUB_MOBI(CatalogPlugin):
                 'CLI' if opts.cli_environment else 'GUI'))
             if opts_dict['ids']:
                 log(" Book count: %d" % len(opts_dict['ids']))

+            sections_list = ['Descriptions','Authors']
+            if opts.generate_titles:
+                sections_list.append('Titles')
+            if opts.generate_recently_added:
+                sections_list.append('Recently Added')
+            if not opts.exclude_genre.strip() == '.':
+                sections_list.append('Genres')
+            log(u"Creating Sections for %s" % ', '.join(sections_list))
+
+            # If exclude_genre is blank, assume user wants all genre tags included
+            if opts.exclude_genre.strip() == '':
+                opts.exclude_genre = '\[^.\]'
+                log(" converting empty exclude_genre to '\[^.\]'")
+
             # Display opts
             keys = opts_dict.keys()
             keys.sort()
             log(" opts:")
             for key in keys:
-                if key in ['catalog_title','exclude_genre','exclude_tags','generate_titles',
-                           'generate_recently_added','note_tag','numbers_as_text','read_tag',
-                           'search_text','sort_by','sync']:
+                if key in ['catalog_title','exclude_genre','exclude_tags',
+                           'note_tag','numbers_as_text','read_tag',
+                           'search_text','sort_by','sort_descriptions_by_author','sync']:
                     log(" %s: %s" % (key, opts_dict[key]))

         # Launch the Catalog builder


@@ -62,7 +62,7 @@ How do I convert my file containing non-English characters, or smart quotes?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 There are two aspects to this problem:
   1. Knowing the encoding of the source file: |app| tries to guess what character encoding your source files use, but often, this is impossible, so you need to tell it what encoding to use. This can be done in the GUI via the :guilabel:`Input character encoding` field in the :guilabel:`Look & Feel` section. The command-line tools all have an :option:`--input-encoding` option.
-  2. When adding HTML files to |app|, you may need to tell |app| what encoding the files are in. To do this go to Preferences->Plugins->File Type plugins and customize the HTML2Zip plugin, telling it what encoding your HTML files are in. Now when you add HTML files to |app| they will be correctly processed. HTML files from different sources often have different encodings, so you may have to change this setting repeatedly. A common encoding for many files from the web is ``cp1252`` and I would suggest you try that first.
+  2. When adding HTML files to |app|, you may need to tell |app| what encoding the files are in. To do this go to Preferences->Plugins->File Type plugins and customize the HTML2Zip plugin, telling it what encoding your HTML files are in. Now when you add HTML files to |app| they will be correctly processed. HTML files from different sources often have different encodings, so you may have to change this setting repeatedly. A common encoding for many files from the web is ``cp1252`` and I would suggest you try that first. Note that when converting HTML files, leave the input encoding setting mentioned above blank. This is because the HTML2ZIP plugin automatically converts the HTML files to a standard encoding (utf-8).
   3. Embedding fonts: If you are generating an LRF file to read on your SONY Reader, you are limited by the fact that the Reader only supports a few non-English characters in the fonts it comes pre-loaded with. You can work around this problem by embedding a unicode-aware font that supports the character set your file uses into the LRF file. You should embed atleast a serif and a sans-serif font. Be aware that embedding fonts significantly slows down page-turn speed on the reader.


@@ -615,10 +615,12 @@ class BasicNewsRecipe(Recipe):
             del o['onload']

         for script in list(soup.findAll('noscript')):
             script.extract()
         for attr in self.remove_attributes:
             for x in soup.findAll(attrs={attr:True}):
                 del x[attr]
+        for base in list(soup.findAll('base')):
+            base.extract()

         return self.postprocess_html(soup, first_fetch)