Merge from trunk

Charles Haley 2010-09-17 07:35:24 +01:00
commit 078925ed7a
15 changed files with 544 additions and 160 deletions

Binary image file changed (1.0 KiB after the change); content not shown.

@@ -1,7 +1,5 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.adventuregamers.com
 '''
@@ -10,14 +8,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class AdventureGamers(BasicNewsRecipe):
     title = u'Adventure Gamers'
     language = 'en'
     __author__ = 'Darko Miletic'
     description = 'Adventure games portal'
     publisher = 'Adventure Gamers'
     category = 'news, games, adventure, technology'
-    language = 'en'
     oldest_article = 10
     delay = 10
     max_articles_per_feed = 100
@@ -26,14 +21,25 @@ class AdventureGamers(BasicNewsRecipe):
     remove_javascript = True
     use_embedded_content = False
     INDEX = u'http://www.adventuregamers.com'
+    extra_css = """
+        .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
+        .pageheader_title{font-size: xx-large; color: #394128}
+        .pageheader_byline{font-size: small; font-weight: bold; color: #394128}
+        .score_bg {display: inline; width: 100%; margin-bottom: 2em}
+        .score_column_1{ padding-left: 10px; font-size: small; width: 50%}
+        .score_column_2{ padding-left: 10px; font-size: small; width: 50%}
+        .score_column_3{ padding-left: 10px; font-size: small; width: 50%}
+        .score_header{font-size: large; color: #50544A}
+        .bodytext{display: block}
+        body{font-family: Helvetica,Arial,sans-serif}
+    """
-    html2lrf_options = [
-          '--comment', description
-        , '--category', category
-        , '--publisher', publisher
-        ]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    conversion_options = {
+          'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+        }

     keep_only_tags = [
         dict(name='div', attrs={'class':'content_middle'})
@@ -45,6 +51,7 @@ class AdventureGamers(BasicNewsRecipe):
         ]
     remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
+    remove_attributes = ['width','height']

     feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
@@ -66,12 +73,12 @@ class AdventureGamers(BasicNewsRecipe):
     def preprocess_html(self, soup):
-        mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
-        soup.head.insert(0,mtag)
         for item in soup.findAll(style=True):
             del item['style']
+        for item in soup.findAll('div', attrs={'class':'floatright'}):
+            item.extract()
         self.append_page(soup, soup.body, 3)
         pager = soup.find('div',attrs={'class':'toolbar_fat'})
         if pager:
             pager.extract()
-        return soup
+        return self.adeify_images(soup)


@@ -10,12 +10,31 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
-    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
     masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+        '''

     keep_only_tags = [
-        dict(name='div', attrs={'id':['cxArticleContent']})
-        ,dict(attrs={'id':['cxArticleText','cxArticleBodyText']})
+        dict(name='div', attrs={'class':['cxArticleHeader']})
+        ,dict(attrs={'id':['cxArticleText']})
         ]
+
+    remove_tags = [
+        dict(name='div' , attrs={'class':'cxArticleList'   })
+        ,dict(name='div' , attrs={'class':'cxFeedTease'    })
+        ,dict(name='div' , attrs={'class':'cxElementEnlarge'   })
+        ,dict(name='div' , attrs={'id':'cxArticleTools'   })
+        ]
+
     feeds = [
         ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
         # -------------------------------------------------------------------
@@ -23,7 +42,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
         # read by simply removing the pound sign from it. I currently have it
         # set to only get the Cobb area
         # --------------------------------------------------------------------
-        ('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
+        #('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
         #('Clayton', 'http://www.ajc.com/section-rss.do?source=clayton'),
         #('DeKalb', 'http://www.ajc.com/section-rss.do?source=dekalb'),
         #('Gwinnett', 'http://www.ajc.com/section-rss.do?source=gwinnett'),
@@ -41,7 +60,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
         # but again
         # You can enable which ever team you like by removing the pound sign
         # ------------------------------------------------------------------------
-        ('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
+        #('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
         #('Braves', 'http://www.ajc.com/genericList-rss.do?source=61457'),
         ('Falcons', 'http://www.ajc.com/genericList-rss.do?source=61458'),
         #('Hawks', 'http://www.ajc.com/genericList-rss.do?source=61522'),
@@ -52,11 +71,16 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
         ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
     ]

+    def postprocess_html(self, soup, first):
+        for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
+            credit_tag.name ='p'
+        return soup
+
-    #def print_version(self, url):
-    #    return url.partition('?')[0] +'?printArticle=y'
+    def print_version(self, url):
+        return url.partition('?')[0] +'?printArticle=y'
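
The print_version() hook shown above relies only on standard string handling: str.partition('?') splits the URL at the first query string, the part before it is kept, and the printer-friendly parameter is appended. A quick illustration with an invented article URL (the path below is hypothetical; only the rewrite pattern comes from the recipe):

    # Hypothetical AJC article URL, for illustration of the rewrite only.
    url = 'http://www.ajc.com/sports/falcons-notes-123456.html?cxntlid=rss'
    print(url.partition('?')[0] + '?printArticle=y')
    # -> http://www.ajc.com/sports/falcons-notes-123456.html?printArticle=y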


@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
+__version__ = '0.95'
+
+''' http://brandeins.de - Wirtschaftsmagazin '''
+import re
+import string
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class BrandEins(BasicNewsRecipe):
+    title = u'Brand Eins'
+    __author__ = 'Constantin Hofstetter'
+    description = u'Wirtschaftsmagazin'
+    publisher ='brandeins.de'
+    category = 'politics, business, wirtschaft, Germany'
+    use_embedded_content = False
+    lang = 'de-DE'
+    no_stylesheets = True
+    encoding = 'utf-8'
+    language = 'de'
+
+    # 2 is the last full magazine (default)
+    # 1 is the newest (but not full)
+    # 3 is one before 2 etc.
+    which_ausgabe = 2
+
+    keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
+
+    '''
+    brandeins.de
+    '''
+
+    def postprocess_html(self, soup,first):
+        # Move the image of the sidebar right below the h3
+        first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
+        for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
+            if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
+                # first_h3.parent.insert(2, imgdiv)
+                first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
+            else:
+                first_h3.parent.insert(2, imgdiv)
+
+        # Now, remove the sidebar
+        soup.find(name='div', attrs={'id':'sidebar'}).extract()
+
+        # Remove the rating-image (stars) from the h3
+        for img in first_h3.findAll(name='img'):
+            img.extract()
+
+        # Mark the intro texts as italic
+        for div in soup.findAll(name='div', attrs={'class':'intro'}):
+            for p in div.findAll('p'):
+                content = self.tag_to_string(p)
+                new_p = "<p><i>"+ content +"</i></p>"
+                p.replaceWith(new_p)
+
+        return soup
+
+    def parse_index(self):
+        feeds = []
+        archive = "http://www.brandeins.de/archiv.html"
+        soup = self.index_to_soup(archive)
+        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
+        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+        url = pre_latest_issue.get('href', False)
+        # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
+        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+        url = 'http://brandeins.de/'+url
+
+        # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
+        titles_and_articles = self.brand_eins_parse_latest_issue(url)
+        if titles_and_articles:
+            for title, articles in titles_and_articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def brand_eins_parse_latest_issue(self, url):
+        soup = self.index_to_soup(url)
+        article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
+
+        titles_and_articles = []
+        current_articles = []
+        chapter_title = "Editorial"
+        self.log('Found Chapter:', chapter_title)
+
+        # Remove last list of links (thats just the impressum and the 'gewinnspiel')
+        article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract()
+
+        for article_list in article_lists:
+            for chapter in article_list.findAll('ul'):
+                if len(chapter.findPreviousSiblings('h3')) >= 1:
+                    new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
+                    if new_chapter_title != chapter_title:
+                        titles_and_articles.append([chapter_title, current_articles])
+                        current_articles = []
+                        self.log('Found Chapter:', new_chapter_title)
+                        chapter_title = new_chapter_title
+                for li in chapter.findAll('li'):
+                    a = li.find('a', href = True)
+                    if a is None:
+                        continue
+                    title = self.tag_to_string(a)
+                    url = a.get('href', False)
+                    if not url or not title:
+                        continue
+                    url = 'http://brandeins.de/'+url
+                    if len(a.parent.findNextSiblings('p')) >= 1:
+                        description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
+                    else:
+                        description = ''
+                    self.log('\t\tFound article:', title)
+                    self.log('\t\t\t', url)
+                    self.log('\t\t\t', description)
+                    current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
+        titles_and_articles.append([chapter_title, current_articles])
+        return titles_and_articles


@@ -1,7 +1,8 @@
 #!/usr/bin/env python
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 calibre recipe for slate.com
 '''
@@ -10,13 +11,12 @@ import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

-class PeriodicalNameHere(BasicNewsRecipe):
+class Slate(BasicNewsRecipe):
     # Method variables for customizing downloads
-    title = 'Slate'
     description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
-    __author__ = 'GRiker and Sujata Raman'
-    max_articles_per_feed = 20
-    oldest_article = 7.0
+    __author__ = 'GRiker, Sujata Raman and Nick Redding'
+    max_articles_per_feed = 100
+    oldest_article = 14
     recursions = 0
     delay = 0
     simultaneous_downloads = 5
@@ -27,6 +27,12 @@ class PeriodicalNameHere(BasicNewsRecipe):
     encoding = None
     language = 'en'

+    slate_complete = True
+    if slate_complete:
+        title = 'Slate (complete)'
+    else:
+        title = 'Slate (weekly)'
+
     # Method variables for customizing feed parsing
     summary_length = 250
     use_embedded_content = None
@@ -42,26 +48,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
     match_regexps = []

     # The second entry is for 'Big Money', which comes from a different site, uses different markup
-    keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body', 'story']}),
+    keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
                       dict(attrs={ 'id':['content']}) ]

     # The second entry is for 'Big Money', which comes from a different site, uses different markup
-    remove_tags = [dict(attrs={ 'id':[
-                        'add_comments_button',
-                        'article_bottom_tools',
-                        'article_bottom_tools_cntr',
-                        'bizbox_links_bottom',
-                        'BOXXLE',
-                        'comments_button',
-                        'comments-to-fray',
-                        'fbog_article_bottom_cntr',
-                        'fray_article_discussion', 'fray_article_links','bottom_sponsored_links','author_bio',
-                        'insider_ad_wrapper',
-                        'js_kit_cntr',
-                        'recommend_tab',
-                        'ris_links_wrapper',
-                        'toolbox',
-                        ]}),
+    remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
+                        'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
+                        'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
+                        'comments_button','add_comments_button','comments-to-fray','marriott_ad',
+                        'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
                    dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]

     excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
@@ -72,16 +67,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
     extra_css = '''
                 .h1_subhead{font-family:Arial; font-size:small; }
                 h1{font-family:Verdana; font-size:large; }
-                .byline {font-family:Georgia; margin-bottom: 0px; color: #660033;}
-                .dateline {font-family:Arial; font-size: smaller; height: 0pt; color:#666666;}
+                .byline {font-family:Georgia; margin-bottom: 0px; }
+                .dateline {font-family:Arial; font-size: smaller; height: 0pt;}
                 .imagewrapper {font-family:Verdana;font-size:x-small; }
                 .source {font-family:Verdana; font-size:x-small;}
                 .credit {font-family:Verdana; font-size: smaller;}
                 #article_body {font-family:Verdana; }
                 #content {font-family:Arial; }
                 .caption{font-family:Verdana;font-style:italic; font-size:x-small;}
-                h3{font-family:Arial; color:#666666; font-size:small}
-                a{color:#0066CC;}
+                h3{font-family:Arial; font-size:small}
                 '''

     # Local variables to extend class
@@ -99,32 +93,59 @@ class PeriodicalNameHere(BasicNewsRecipe):
             if isinstance(item, (NavigableString, CData)):
                 strings.append(item.string)
             elif isinstance(item, Tag):
-                res = self.tag_to_string(item)
+                res = self.tag_to_string(item,use_alt=False)
                 if res:
                     strings.append(res)
         return strings

+    def extract_named_sections(self):
-    def extract_sections(self):
         soup = self.index_to_soup( self.baseURL )
-        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
+        soup_nav_bar = soup.find(True, attrs={'id':'nav'})
+        briefing_nav = soup.find('li')
+        briefing_url = briefing_nav.a['href']
+        for section_nav in soup_nav_bar.findAll('li'):
+            section_name = self.tag_to_string(section_nav,use_alt=False)
+            self.section_dates.append(section_name)
+
+        soup = self.index_to_soup(briefing_url)
+        self.log("Briefing url = %s " % briefing_url)
+        section_lists = soup.findAll('ul','view_links_list')
+
+        sections = []
+        for section in section_lists :
+            sections.append(section)
+        return sections
+
+    def extract_dated_sections(self):
+        soup = self.index_to_soup( self.baseURL )
+        soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
+        if soup_top_stories:
+            self.section_dates.append("Top Stories")
+            self.log("SELECTION TOP STORIES %s" % "Top Stories")

         soup = soup.find(True, attrs={'id':'toc_links_container'})

         todays_section = soup.find(True, attrs={'class':'todaydateline'})
         self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+        self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))

         older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
         for older_section in older_section_dates :
             self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
+            self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))

         if soup_top_stories:
-            headline_stories = soup_top_stories.find('ul')
+            headline_stories = soup_top_stories
+            self.log("HAVE top_stories")
         else:
             headline_stories = None
+            self.log("NO top_stories")

         section_lists = soup.findAll('ul')
         # Prepend the headlines to the first section
         if headline_stories:
-            section_lists[0].insert(0,headline_stories)
+            section_lists.insert(0,headline_stories)

         sections = []
         for section in section_lists :
@@ -133,9 +154,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
     def extract_section_articles(self, sections_html) :
         # Find the containers with section content
-        soup = self.index_to_soup(str(sections_html))
-        sections = soup.findAll('ul')
+        sections = sections_html

         articles = {}
         key = None
@@ -145,10 +165,25 @@ class PeriodicalNameHere(BasicNewsRecipe):
             # Get the section name
             if section.has_key('id') :
+                self.log("PROCESSING SECTION id = %s" % section['id'])
                 key = self.section_dates[i]
+                if key.startswith("Pod"):
+                    continue
+                if key.startswith("Blog"):
+                    continue
+                articles[key] = []
+                ans.append(key)
+            elif self.slate_complete:
+                key = self.section_dates[i]
+                if key.startswith("Pod"):
+                    continue
+                if key.startswith("Blog"):
+                    continue
+                self.log("PROCESSING SECTION name = %s" % key)
                 articles[key] = []
                 ans.append(key)
             else :
+                self.log("SECTION %d HAS NO id" % i);
                 continue

             # Get the section article_list
@@ -159,8 +194,10 @@ class PeriodicalNameHere(BasicNewsRecipe):
                 bylines = self.tag_to_strings(article)
                 url = article.a['href']
                 title = bylines[0]
-                full_title = self.tag_to_string(article)
+                full_title = self.tag_to_string(article,use_alt=False)
+                #self.log("ARTICLE TITLE%s" % title)
+                #self.log("ARTICLE FULL_TITLE%s" % full_title)
+                #self.log("URL %s" % url)
                 author = None
                 description = None
                 pubdate = None
@@ -191,7 +228,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
                     excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                     found_excluded = excluded.search(description)
                     if found_excluded :
-                        if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+                        self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue

                 # Skip articles whose title contain excluded keywords
@@ -200,7 +237,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
                     #self.log("evaluating full_title: %s" % full_title)
                     found_excluded = excluded.search(full_title)
                     if found_excluded :
-                        if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+                        self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue

                 # Skip articles whose author contain excluded keywords
@@ -208,7 +245,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
                     excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                     found_excluded = excluded.search(author)
                     if found_excluded :
-                        if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+                        self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue

                 skip_this_article = False
@@ -216,6 +253,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
                 for article in articles[key] :
                     if article['url'] == url :
                         skip_this_article = True
+                        self.log("SKIPPING DUP %s" % url)
                         break

                 if skip_this_article :
@@ -227,6 +265,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
                     articles[feed] = []
                 articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                            author=author, content=''))
+                #self.log("KEY %s" % feed)
+                #self.log("APPENDED %s" % url)

         # Promote 'newspapers' to top
         for (i,article) in enumerate(articles[feed]) :
             if article['description'] is not None :
@@ -235,32 +275,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        ans = self.remove_duplicates(ans)
-        return ans
-
-    def flatten_document(self, ans):
-        flat_articles = []
-        for (i,section) in enumerate(ans) :
-            #self.log("flattening section %s: " % section[0])
-            for article in section[1] :
-                #self.log("moving %s to flat_articles[]" % article['title'])
-                flat_articles.append(article)
-        flat_section = ['All Articles', flat_articles]
-        flat_ans = [flat_section]
-        return flat_ans
-
-    def remove_duplicates(self, ans):
-        # Return a stripped ans
-        for (i,section) in enumerate(ans) :
-            #self.log("section %s: " % section[0])
-            for article in section[1] :
-                #self.log("\t%s" % article['title'])
-                #self.log("\looking for %s" % article['url'])
-                for (j,subsequent_section) in enumerate(ans[i+1:]) :
-                    for (k,subsequent_article) in enumerate(subsequent_section[1]) :
-                        if article['url'] == subsequent_article['url'] :
-                            #self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
-                            del subsequent_section[1][k]
         return ans

     def print_version(self, url) :
@@ -268,13 +282,22 @@ class PeriodicalNameHere(BasicNewsRecipe):
     # Class methods
     def parse_index(self) :
-        sections = self.extract_sections()
+        if self.slate_complete:
+            sections = self.extract_named_sections()
+        else:
+            sections = self.extract_dated_sections()
         section_list = self.extract_section_articles(sections)
-        section_list = self.flatten_document(section_list)
         return section_list

-    def get_browser(self) :
-        return BasicNewsRecipe.get_browser()
+    def get_masthead_url(self):
+        masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)
+        except:
+            self.log("\nMasthead unavailable")
+            masthead = None
+        return masthead

     def stripAnchors(self,soup):
         body = soup.find('div',attrs={'id':['article_body','content']})
@@ -304,8 +327,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
         excluded = re.compile('|'.join(self.excludedContentKeywords))
         found_excluded = excluded.search(str(soup))
         if found_excluded :
-            print "no allowed content found, removing article"
-            raise Exception('String error')
+            print "No allowed content found, removing article"
+            raise Exception('Rejected article')

         # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
         head = soup.find('head')
@@ -338,7 +361,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
         dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
         if dept_kicker is not None :
             kicker_strings = self.tag_to_strings(dept_kicker)
-            #kicker = kicker_strings[2] + kicker_strings[3]
             kicker = ''.join(kicker_strings[2:])
             kicker = re.sub('\.','',kicker)
             h3Tag = Tag(soup, "h3")
@@ -346,25 +368,11 @@ class PeriodicalNameHere(BasicNewsRecipe):
             emTag.insert(0,NavigableString(kicker))
             h3Tag.insert(0, emTag)
             dept_kicker.replaceWith(h3Tag)
+        else:
+            self.log("No kicker--return null")
+            return None

-        # Change <h1> to <h2>
-        headline = soup.find("h1")
-        #tag = headline.find("span")
-        #tag.name = 'div'
-        if headline is not None :
-            h2tag = Tag(soup, "h2")
-            h2tag['class'] = "headline"
-            strs = self.tag_to_strings(headline)
-            result = ''
-            for (i,substr) in enumerate(strs) :
-                result += substr
-                if i < len(strs) -1 :
-                    result += '<br />'
-            #h2tag.insert(0, result)
-            #headline.replaceWith(h2tag)
-
         # Fix up the concatenated byline and dateline
         byline = soup.find(True,attrs={'class':'byline'})
         if byline is not None :
             bylineTag = Tag(soup,'div')


@@ -5,15 +5,16 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Timothy Legge <timlegge at gmail.com> and Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os
+import os, time
 import sqlite3 as sqlite
 from calibre.devices.usbms.books import BookList
 from calibre.devices.kobo.books import Book
 from calibre.devices.kobo.books import ImageWrapper
 from calibre.devices.mime import mime_type_ext
-from calibre.devices.usbms.driver import USBMS
+from calibre.devices.usbms.driver import USBMS, debug_print
 from calibre import prints
+from calibre.devices.usbms.books import CollectionsBookList

 class KOBO(USBMS):
@@ -21,12 +22,15 @@ class KOBO(USBMS):
     gui_name = 'Kobo Reader'
     description = _('Communicate with the Kobo Reader')
     author = 'Timothy Legge and Kovid Goyal'
-    version = (1, 0, 4)
+    version = (1, 0, 6)

     supported_platforms = ['windows', 'osx', 'linux']

+    booklist_class = CollectionsBookList
+
     # Ordered list of supported formats
     FORMATS = ['epub', 'pdf']
+    CAN_SET_METADATA = True

     VENDOR_ID = [0x2237]
     PRODUCT_ID = [0x4161]
@@ -40,6 +44,12 @@ class KOBO(USBMS):
     VIRTUAL_BOOK_EXTENSIONS = frozenset(['kobo'])

+    EXTRA_CUSTOMIZATION_MESSAGE = _('The Kobo supports only one collection '
+            'currently: the \"Im_Reading\" list. Create a tag called \"Im_Reading\" ')+\
+            'for automatic management'
+
+    EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['tags'])
+
     def initialize(self):
         USBMS.initialize(self)
         self.book_class = Book
@@ -63,6 +73,8 @@ class KOBO(USBMS):
                 self._card_b_prefix if oncard == 'cardb' \
                 else self._main_prefix

+        self.booklist_class.rebuild_collections = self.rebuild_collections
+
         # get the metadata cache
         bl = self.booklist_class(oncard, prefix, self.settings)
         need_sync = self.parse_metadata_cache(bl, prefix, self.METADATA_CACHE)
@@ -85,9 +97,7 @@ class KOBO(USBMS):
                 playlist_map = {}

                 if readstatus == 1:
-                    if lpath not in playlist_map:
-                        playlist_map[lpath] = []
-                    playlist_map[lpath].append("I\'m Reading")
+                    playlist_map[lpath]= "Im_Reading"

                 path = self.normalize_path(path)
                 # print "Normalized FileName: " + path
@@ -104,14 +114,17 @@ class KOBO(USBMS):
                     if self.update_metadata_item(bl[idx]):
                         # print 'update_metadata_item returned true'
                         changed = True
-                    bl[idx].device_collections = playlist_map.get(lpath, [])
+                    if lpath in playlist_map and \
+                        playlist_map[lpath] not in bl[idx].device_collections:
+                            bl[idx].device_collections.append(playlist_map[lpath])
                 else:
                     if ContentType == '6':
                         book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
                     else:
                         book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
                     # print 'Update booklist'
-                    book.device_collections = playlist_map.get(book.lpath, [])
+                    book.device_collections = [playlist_map[lpath]] if lpath in playlist_map else []

                     if bl.add_book(book, replace_metadata=False):
                         changed = True
             except: # Probably a path encoding error
@@ -398,3 +411,95 @@ class KOBO(USBMS):
         size = os.stat(cls.normalize_path(os.path.join(prefix, lpath))).st_size
         book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=size, other=mi)
         return book
+
+    def get_device_paths(self):
+        paths, prefixes = {}, {}
+        for prefix, path, source_id in [
+                ('main', 'metadata.calibre', 0),
+                ('card_a', 'metadata.calibre', 1),
+                ('card_b', 'metadata.calibre', 2)
+                ]:
+            prefix = getattr(self, '_%s_prefix'%prefix)
+            if prefix is not None and os.path.exists(prefix):
+                paths[source_id] = os.path.join(prefix, *(path.split('/')))
+        return paths
+
+    def update_device_database_collections(self, booklists, collections_attributes):
+        # debug_print('Starting update_device_database_collections', collections_attributes)
+        # Force collections_attributes to be 'tags' as no other is currently supported
+        # debug_print('KOBO: overriding the provided collections_attributes:', collections_attributes)
+        collections_attributes = ['tags']
+
+        collections = booklists.get_collections(collections_attributes)
+        # debug_print('Collections', collections)
+        for category, books in collections.items():
+            if category == 'Im_Reading':
+                # Create a connection to the sqlite database
+                connection = sqlite.connect(self._main_prefix + '.kobo/KoboReader.sqlite')
+                cursor = connection.cursor()
+
+                # Reset Im_Reading list in the database
+                query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null'
+                try:
+                    cursor.execute (query)
+                except:
+                    debug_print('Database Exception: Unable to reset Im_Reading list')
+                    raise
+                else:
+                    # debug_print('Commit: Reset Im_Reading list')
+                    connection.commit()
+
+                for book in books:
+                    # debug_print('Title:', book.title, 'lpath:', book.path)
+                    book.device_collections = ['Im_Reading']
+
+                    extension = os.path.splitext(book.path)[1]
+                    ContentType = self.get_content_type_from_extension(extension)
+
+                    ContentID = self.contentid_from_path(book.path, ContentType)
+                    datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())
+
+                    t = (datelastread,ContentID,)
+
+                    try:
+                        cursor.execute('update content set ReadStatus=1,FirstTimeReading=\'false\',DateLastRead=? where BookID is Null and ContentID = ?', t)
+                    except:
+                        debug_print('Database Exception: Unable create Im_Reading list')
+                        raise
+                    else:
+                        connection.commit()
+                        # debug_print('Database: Commit create Im_Reading list')
+
+                cursor.close()
+                connection.close()
+
+        # debug_print('Finished update_device_database_collections', collections_attributes)
+
+    def sync_booklists(self, booklists, end_session=True):
+        # debug_print('KOBO: started sync_booklists')
+        paths = self.get_device_paths()
+
+        blists = {}
+        for i in paths:
+            if booklists[i] is not None:
+                #debug_print('Booklist: ', i)
+                blists[i] = booklists[i]
+
+        opts = self.settings()
+        if opts.extra_customization:
+            collections = [x.lower().strip() for x in
+                    opts.extra_customization.split(',')]
+        else:
+            collections = []
+
+        #debug_print('KOBO: collection fields:', collections)
+        for i, blist in blists.items():
+            self.update_device_database_collections(blist, collections)
+
+        USBMS.sync_booklists(self, booklists, end_session=end_session)
+        #debug_print('KOBO: finished sync_booklists')
+
+    def rebuild_collections(self, booklist, oncard):
+        collections_attributes = []
+        self.update_device_database_collections(booklist, collections_attributes)


@@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
              'chapter', 'chapter_mark',
              'prefer_metadata_cover', 'remove_first_image',
              'insert_metadata', 'page_breaks_before',
-             'preprocess_html',
+             'preprocess_html', 'html_unwrap_factor',
             ]
         ),


@@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html',
             )
         ),

+OptionRecommendation(name='html_unwrap_factor',
+        recommended_value=0.40, level=OptionRecommendation.LOW,
+        help=_('Scale used to determine the length at which a line should '
+            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
+            'default is 0.40, just below the median line length. This will unwrap typical books '
+            ' with hard line breaks, but should be reduced if the line length is variable.'
+            )
+        ),
+
 OptionRecommendation(name='smarten_punctuation',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Convert plain quotes, dashes and ellipsis to their '
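
For orientation, the new html_unwrap_factor recommendation is consumed later in this commit by the PreProcessor in calibre.ebooks.conversion.utils via a getattr() with a 0.4 fallback. A minimal sketch of that hand-off pattern (the Opts class below is an invented stand-in for the real conversion options object, not calibre code):

    # Sketch only: how a pipeline option value reaches its consumer with a default.
    class PreProcessorSketch(object):
        def __init__(self, extra_opts=None):
            self.extra_opts = extra_opts

        def unwrap_factor(self):
            # Fall back to 0.40 when the option is absent, mirroring the change below.
            return getattr(self.extra_opts, 'html_unwrap_factor', 0.4)

    class Opts(object):
        html_unwrap_factor = 0.5

    print(PreProcessorSketch(Opts()).unwrap_factor())  # 0.5
    print(PreProcessorSketch().unwrap_factor())        # 0.4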


@@ -351,7 +351,7 @@ class HTMLPreProcessor(object):
                 # print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )

         for rule in self.PREPROCESS + start_rules:


@@ -11,10 +11,11 @@ from calibre.utils.logging import default_log
 class PreProcessor(object):

-    def __init__(self, log=None):
+    def __init__(self, log=None, extra_opts=None):
         self.log = default_log if log is None else log
         self.html_preprocess_sections = 0
         self.found_indents = 0
+        self.extra_opts = extra_opts

     def chapter_head(self, match):
         chap = match.group('chap')
@@ -91,6 +92,7 @@ class PreProcessor(object):
         # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         if len(lines) > 1:
@@ -147,15 +149,16 @@ class PreProcessor(object):
             format = 'html'

         # Calculate Length
-        length = line_length(format, html, 0.4)
+        length = line_length('pdf', html, getattr(self.extra_opts,
+            'html_unwrap_factor', 0.4))
         self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
         #
         # Unwrap and/or delete soft-hyphens, hyphens
         html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
         html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)

-        # Unwrap lines using punctation if the median length of all lines is less than 200
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        # Unwrap lines using punctation and line length
+        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         html = unwrap.sub(' ', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation


@@ -12,6 +12,7 @@ from copy import deepcopy
 from lxml import etree

 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type

 class Canvas(etree.XSLTExtension):
@@ -419,4 +420,9 @@ class LRFInput(InputFormatPlugin):
         styles.write()
         return os.path.abspath('content.opf')

+    def preprocess_html(self, html):
+        preprocessor = PreProcessor(log=getattr(self, 'log', None))
+        return preprocessor(html)


@@ -26,8 +26,10 @@ class StructureDetectionWidget(Widget, Ui_Form):
                 'remove_first_image',
                 'insert_metadata', 'page_breaks_before',
                 'preprocess_html', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex']
+                'remove_footer', 'footer_regex','html_unwrap_factor']
                 )
+        self.opt_html_unwrap_factor.setEnabled(False)
+        self.huf_label.setEnabled(False)
         self.db, self.book_id = db, book_id
         for x in ('pagebreak', 'rule', 'both', 'none'):
             self.opt_chapter_mark.addItem(x)
@@ -64,3 +66,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
                     _('The XPath expression %s is invalid.')%x.text).exec_()
             return False
         return True
+
+    def set_value_handler(self, g, val):
+        if val is None and g is self.opt_html_unwrap_factor:
+            g.setValue(0.0)
+            return True


@@ -14,10 +14,10 @@
     <string>Form</string>
    </property>
    <layout class="QGridLayout" name="gridLayout">
-    <item row="0" column="0" colspan="2">
+    <item row="0" column="1" colspan="2">
      <widget class="XPathEdit" name="opt_chapter" native="true"/>
     </item>
-    <item row="1" column="0">
+    <item row="1" column="0" colspan="2">
     <widget class="QLabel" name="label">
      <property name="text">
       <string>Chapter &amp;mark:</string>
@@ -27,31 +27,31 @@
      </property>
     </widget>
    </item>
-    <item row="1" column="1">
+    <item row="1" column="2">
     <widget class="QComboBox" name="opt_chapter_mark">
      <property name="minimumContentsLength">
       <number>20</number>
      </property>
     </widget>
    </item>
-    <item row="2" column="0">
+    <item row="2" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_remove_first_image">
      <property name="text">
       <string>Remove first &amp;image</string>
      </property>
     </widget>
    </item>
-    <item row="4" column="0">
+    <item row="5" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_insert_metadata">
      <property name="text">
       <string>Insert &amp;metadata as page at start of book</string>
      </property>
     </widget>
    </item>
-    <item row="10" column="0" colspan="2">
+    <item row="11" column="0" colspan="3">
     <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
    </item>
-    <item row="11" column="0" colspan="2">
+    <item row="12" column="0" colspan="3">
     <spacer name="verticalSpacer">
      <property name="orientation">
       <enum>Qt::Vertical</enum>
@@ -64,27 +64,66 @@
      </property>
     </spacer>
    </item>
-    <item row="7" column="0">
+    <item row="8" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_remove_footer">
      <property name="text">
       <string>Remove F&amp;ooter</string>
      </property>
     </widget>
    </item>
-    <item row="5" column="0">
+    <item row="6" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_remove_header">
      <property name="text">
       <string>Remove H&amp;eader</string>
      </property>
     </widget>
    </item>
-    <item row="6" column="0" colspan="2">
+    <item row="7" column="0" colspan="3">
     <widget class="RegexEdit" name="opt_header_regex" native="true"/>
    </item>
-    <item row="8" column="0" colspan="2">
+    <item row="9" column="0" colspan="3">
     <widget class="RegexEdit" name="opt_footer_regex" native="true"/>
    </item>
-    <item row="3" column="0">
+    <item row="4" column="1">
+     <widget class="QLabel" name="huf_label">
+      <property name="text">
+       <string>Line &amp;un-wrap factor during preprocess:</string>
+      </property>
+      <property name="buddy">
+       <cstring>opt_html_unwrap_factor</cstring>
+      </property>
+     </widget>
+    </item>
+    <item row="4" column="2">
+     <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
+      <property name="toolTip">
+       <string/>
+      </property>
+      <property name="maximum">
+       <double>1.000000000000000</double>
+      </property>
+      <property name="singleStep">
+       <double>0.050000000000000</double>
+      </property>
+      <property name="value">
+       <double>0.400000000000000</double>
+      </property>
+     </widget>
+    </item>
+    <item row="4" column="0">
+     <spacer name="horizontalSpacer">
+      <property name="orientation">
+       <enum>Qt::Horizontal</enum>
+      </property>
+      <property name="sizeHint" stdset="0">
+       <size>
+        <width>40</width>
+        <height>20</height>
+       </size>
+      </property>
+     </spacer>
+    </item>
+    <item row="3" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_preprocess_html">
      <property name="text">
       <string>&amp;Preprocess input file to possibly improve structure detection</string>
@@ -108,5 +147,38 @@
   </customwidget>
  </customwidgets>
 <resources/>
-<connections/>
+<connections>
+ <connection>
+  <sender>opt_preprocess_html</sender>
+  <signal>toggled(bool)</signal>
+  <receiver>opt_html_unwrap_factor</receiver>
+  <slot>setEnabled(bool)</slot>
+  <hints>
+   <hint type="sourcelabel">
+    <x>328</x>
+    <y>87</y>
+   </hint>
+   <hint type="destinationlabel">
+    <x>481</x>
+    <y>113</y>
+   </hint>
+  </hints>
+ </connection>
+ <connection>
+  <sender>opt_preprocess_html</sender>
+  <signal>toggled(bool)</signal>
+  <receiver>huf_label</receiver>
+  <slot>setEnabled(bool)</slot>
+  <hints>
+   <hint type="sourcelabel">
+    <x>295</x>
+    <y>88</y>
+   </hint>
+   <hint type="destinationlabel">
+    <x>291</x>
+    <y>105</y>
+   </hint>
+  </hints>
+ </connection>
+</connections>
 </ui>


@@ -6,10 +6,7 @@ The dialog used to edit meta information for a book as well as
 add/remove formats
 '''

-import os
-import re
-import time
-import traceback
+import os, re, time, traceback, textwrap

 from PyQt4.Qt import SIGNAL, QObject, Qt, QTimer, QThread, QDate, \
                      QPixmap, QListWidgetItem, QDialog, pyqtSignal
@@ -331,6 +328,14 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         ResizableDialog.__init__(self, window)
         self.bc_box.layout().setAlignment(self.cover, Qt.AlignCenter|Qt.AlignHCenter)
         self.cancel_all = False
+        base = unicode(self.author_sort.toolTip())
+        self.ok_aus_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+
+                _(' The green color indicates that the current '
+                  'author sort matches the current author'))
+        self.bad_aus_tooltip = '<p>'+textwrap.fill(base + '<br><br>'+
+                _(' The red color indicates that the current '
+                  'author sort does not match the current author'))
         if cancel_all:
             self.__abort_button = self.button_box.addButton(self.button_box.Abort)
             self.__abort_button.setToolTip(_('Abort the editing of all remaining books'))
@@ -490,6 +495,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             col = 'rgb(255, 0, 0, 20%)'
         self.author_sort.setStyleSheet('QLineEdit { color: black; '
                 'background-color: %s; }'%col)
+        tt = self.ok_aus_tooltip if normal else self.bad_aus_tooltip
+        self.author_sort.setToolTip(tt)

     def validate_isbn(self, isbn):
         isbn = unicode(isbn).strip()


@@ -330,6 +330,17 @@ There are a few more options in this section.
     two covers. This option will simply remove the first image from the source document, thereby
     ensuring that the converted book has only one cover, the one specified in |app|.

+:guilabel:`Preprocess input`
+    This option activates various algorithms that try to detect and correct common cases of
+    badly formatted input documents, such as hard line breaks and large blocks of text with no formatting.
+    Turn this option on if your input document suffers from bad formatting. Be aware that in
+    some cases this option can lead to worse results, so use it with care.
+
+:guilabel:`Line-unwrap factor`
+    This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the lengths of 40% of the lines in the document.
+
 Table of Contents
 ------------------
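
A rough plain-text illustration of the unwrap heuristic described above (calibre's actual implementation works on the intermediate HTML with the regular expressions added earlier in this commit; the helper below only mirrors the idea of combining a length threshold, derived from the chosen factor, with a punctuation test):

    # Sketch: derive a length threshold from the factor, then join a line onto the
    # previous one when that previous line has reached the threshold and does not
    # end with sentence-final punctuation (the pattern the unwrap regexes encode).
    def unwrap_threshold(lines, factor=0.4):
        lengths = sorted(len(l) for l in lines if l.strip())
        return lengths[int(factor * (len(lengths) - 1))] if lengths else 0

    def unwrap(lines, factor=0.4):
        threshold = unwrap_threshold(lines, factor)
        out = []
        for line in lines:
            prev = out[-1].rstrip() if out else ''
            if len(prev) >= threshold and (prev[-1:].islower() or prev.endswith((',', ';', ':', '-'))):
                out[-1] = prev + ' ' + line.lstrip()   # remove the hard break
            else:
                out.append(line)
        return out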