Mirror of https://github.com/kovidgoyal/calibre.git
Merge from trunk
commit 078925ed7a
BIN resources/images/news/ajc.png (Normal file)
Binary file not shown. Size: 1.0 KiB
@ -1,7 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.adventuregamers.com
|
||||
'''
|
||||
@ -10,14 +8,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdventureGamers(BasicNewsRecipe):
|
||||
title = u'Adventure Gamers'
|
||||
language = 'en'
|
||||
|
||||
language = 'en'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Adventure games portal'
|
||||
description = 'Adventure games portal'
|
||||
publisher = 'Adventure Gamers'
|
||||
category = 'news, games, adventure, technology'
|
||||
language = 'en'
|
||||
|
||||
category = 'news, games, adventure, technology'
|
||||
oldest_article = 10
|
||||
delay = 10
|
||||
max_articles_per_feed = 100
|
||||
@ -26,14 +21,25 @@ class AdventureGamers(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
INDEX = u'http://www.adventuregamers.com'
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment', description
|
||||
, '--category', category
|
||||
, '--publisher', publisher
|
||||
]
|
||||
|
||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||
extra_css = """
|
||||
.pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
|
||||
.pageheader_title{font-size: xx-large; color: #394128}
|
||||
.pageheader_byline{font-size: small; font-weight: bold; color: #394128}
|
||||
.score_bg {display: inline; width: 100%; margin-bottom: 2em}
|
||||
.score_column_1{ padding-left: 10px; font-size: small; width: 50%}
|
||||
.score_column_2{ padding-left: 10px; font-size: small; width: 50%}
|
||||
.score_column_3{ padding-left: 10px; font-size: small; width: 50%}
|
||||
.score_header{font-size: large; color: #50544A}
|
||||
.bodytext{display: block}
|
||||
body{font-family: Helvetica,Arial,sans-serif}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':'content_middle'})
|
||||
@ -43,14 +49,15 @@ class AdventureGamers(BasicNewsRecipe):
|
||||
dict(name=['object','link','embed','form'])
|
||||
,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']})
|
||||
]
|
||||
|
||||
|
||||
remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
|
||||
|
||||
|
||||
def get_article_url(self, article):
|
||||
return article.get('guid', None)
|
||||
|
||||
|
||||
def append_page(self, soup, appendtag, position):
|
||||
pager = soup.find('div',attrs={'class':'toolbar_fat_next'})
|
||||
if pager:
|
||||
@ -59,19 +66,19 @@ class AdventureGamers(BasicNewsRecipe):
|
||||
texttag = soup2.find('div', attrs={'class':'bodytext'})
|
||||
for it in texttag.findAll(style=True):
|
||||
del it['style']
|
||||
newpos = len(texttag.contents)
|
||||
newpos = len(texttag.contents)
|
||||
self.append_page(soup2,texttag,newpos)
|
||||
texttag.extract()
|
||||
appendtag.insert(position,texttag)
|
||||
|
||||
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
|
||||
soup.head.insert(0,mtag)
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('div', attrs={'class':'floatright'}):
|
||||
item.extract()
|
||||
self.append_page(soup, soup.body, 3)
|
||||
pager = soup.find('div',attrs={'class':'toolbar_fat'})
|
||||
if pager:
|
||||
pager.extract()
|
||||
return soup
|
||||
pager.extract()
|
||||
return self.adeify_images(soup)
|
||||
|
@ -10,12 +10,31 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
|
||||
|
||||
masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
|
||||
extra_css = '''
|
||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
||||
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
|
||||
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
|
||||
'''
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':['cxArticleContent']})
|
||||
,dict(attrs={'id':['cxArticleText','cxArticleBodyText']})
|
||||
dict(name='div', attrs={'class':['cxArticleHeader']})
|
||||
,dict(attrs={'id':['cxArticleText']})
|
||||
]
|
||||
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div' , attrs={'class':'cxArticleList' })
|
||||
,dict(name='div' , attrs={'class':'cxFeedTease' })
|
||||
,dict(name='div' , attrs={'class':'cxElementEnlarge' })
|
||||
,dict(name='div' , attrs={'id':'cxArticleTools' })
|
||||
]
|
||||
|
||||
|
||||
|
||||
feeds = [
|
||||
('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
|
||||
# -------------------------------------------------------------------
|
||||
@ -23,7 +42,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
|
||||
# read by simply removing the pound sign from it. I currently have it
|
||||
# set to only get the Cobb area
|
||||
# --------------------------------------------------------------------
|
||||
('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
|
||||
#('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
|
||||
#('Clayton', 'http://www.ajc.com/section-rss.do?source=clayton'),
|
||||
#('DeKalb', 'http://www.ajc.com/section-rss.do?source=dekalb'),
|
||||
#('Gwinnett', 'http://www.ajc.com/section-rss.do?source=gwinnett'),
|
||||
@ -41,7 +60,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
|
||||
# but again
|
||||
# You can enable which ever team you like by removing the pound sign
|
||||
# ------------------------------------------------------------------------
|
||||
('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
|
||||
#('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
|
||||
#('Braves', 'http://www.ajc.com/genericList-rss.do?source=61457'),
|
||||
('Falcons', 'http://www.ajc.com/genericList-rss.do?source=61458'),
|
||||
#('Hawks', 'http://www.ajc.com/genericList-rss.do?source=61522'),
|
||||
@ -52,11 +71,16 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
|
||||
('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
|
||||
]
|
||||
|
||||
def postprocess_html(self, soup, first):
|
||||
for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
|
||||
credit_tag.name ='p'
|
||||
|
||||
return soup
|
||||
|
||||
#def print_version(self, url):
|
||||
# return url.partition('?')[0] +'?printArticle=y'
|
||||
|
||||
|
||||
|
||||
def print_version(self, url):
|
||||
return url.partition('?')[0] +'?printArticle=y'
|
||||
|
||||
|
||||
|
||||
|
125 resources/recipes/brand_eins.recipe (Normal file)
@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
|
||||
__version__ = '0.95'
|
||||
|
||||
''' http://brandeins.de - Wirtschaftsmagazin '''
|
||||
import re
|
||||
import string
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class BrandEins(BasicNewsRecipe):
|
||||
|
||||
title = u'Brand Eins'
|
||||
__author__ = 'Constantin Hofstetter'
|
||||
description = u'Wirtschaftsmagazin'
|
||||
publisher ='brandeins.de'
|
||||
category = 'politics, business, wirtschaft, Germany'
|
||||
use_embedded_content = False
|
||||
lang = 'de-DE'
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
language = 'de'
|
||||
|
||||
# 2 is the last full magazine (default)
|
||||
# 1 is the newest (but not full)
|
||||
# 3 is one before 2 etc.
|
||||
which_ausgabe = 2
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
|
||||
|
||||
'''
|
||||
brandeins.de
|
||||
'''
|
||||
|
||||
def postprocess_html(self, soup,first):
|
||||
|
||||
# Move the image of the sidebar right below the h3
|
||||
first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
|
||||
for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
|
||||
if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
|
||||
# first_h3.parent.insert(2, imgdiv)
|
||||
first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
|
||||
else:
|
||||
first_h3.parent.insert(2, imgdiv)
|
||||
|
||||
# Now, remove the sidebar
|
||||
soup.find(name='div', attrs={'id':'sidebar'}).extract()
|
||||
|
||||
# Remove the rating-image (stars) from the h3
|
||||
for img in first_h3.findAll(name='img'):
|
||||
img.extract()
|
||||
|
||||
# Mark the intro texts as italic
|
||||
for div in soup.findAll(name='div', attrs={'class':'intro'}):
|
||||
for p in div.findAll('p'):
|
||||
content = self.tag_to_string(p)
|
||||
new_p = "<p><i>"+ content +"</i></p>"
|
||||
p.replaceWith(new_p)
|
||||
|
||||
return soup
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
|
||||
archive = "http://www.brandeins.de/archiv.html"
|
||||
|
||||
soup = self.index_to_soup(archive)
|
||||
latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
|
||||
pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
|
||||
url = pre_latest_issue.get('href', False)
|
||||
# Get the title for the magazin - build it out of the title of the cover - take the issue and year;
|
||||
self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
|
||||
url = 'http://brandeins.de/'+url
|
||||
|
||||
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
|
||||
titles_and_articles = self.brand_eins_parse_latest_issue(url)
|
||||
if titles_and_articles:
|
||||
for title, articles in titles_and_articles:
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
def brand_eins_parse_latest_issue(self, url):
|
||||
soup = self.index_to_soup(url)
|
||||
article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
|
||||
|
||||
titles_and_articles = []
|
||||
current_articles = []
|
||||
chapter_title = "Editorial"
|
||||
self.log('Found Chapter:', chapter_title)
|
||||
|
||||
# Remove last list of links (thats just the impressum and the 'gewinnspiel')
|
||||
article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract()
|
||||
|
||||
for article_list in article_lists:
|
||||
for chapter in article_list.findAll('ul'):
|
||||
if len(chapter.findPreviousSiblings('h3')) >= 1:
|
||||
new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
|
||||
if new_chapter_title != chapter_title:
|
||||
titles_and_articles.append([chapter_title, current_articles])
|
||||
current_articles = []
|
||||
self.log('Found Chapter:', new_chapter_title)
|
||||
chapter_title = new_chapter_title
|
||||
for li in chapter.findAll('li'):
|
||||
a = li.find('a', href = True)
|
||||
if a is None:
|
||||
continue
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
if not url or not title:
|
||||
continue
|
||||
url = 'http://brandeins.de/'+url
|
||||
if len(a.parent.findNextSiblings('p')) >= 1:
|
||||
description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
|
||||
else:
|
||||
description = ''
|
||||
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
self.log('\t\t\t', description)
|
||||
|
||||
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
|
||||
titles_and_articles.append([chapter_title, current_articles])
|
||||
return titles_and_articles
|
@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
calibre recipe for slate.com
|
||||
'''
|
||||
@ -10,13 +11,12 @@ import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
|
||||
|
||||
class PeriodicalNameHere(BasicNewsRecipe):
|
||||
class Slate(BasicNewsRecipe):
|
||||
# Method variables for customizing downloads
|
||||
title = 'Slate'
|
||||
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
|
||||
__author__ = 'GRiker and Sujata Raman'
|
||||
max_articles_per_feed = 20
|
||||
oldest_article = 7.0
|
||||
__author__ = 'GRiker, Sujata Raman and Nick Redding'
|
||||
max_articles_per_feed = 100
|
||||
oldest_article = 14
|
||||
recursions = 0
|
||||
delay = 0
|
||||
simultaneous_downloads = 5
|
||||
@ -27,6 +27,12 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
encoding = None
|
||||
language = 'en'
|
||||
|
||||
slate_complete = True
|
||||
if slate_complete:
|
||||
title = 'Slate (complete)'
|
||||
else:
|
||||
title = 'Slate (weekly)'
|
||||
|
||||
# Method variables for customizing feed parsing
|
||||
summary_length = 250
|
||||
use_embedded_content = None
|
||||
@ -42,26 +48,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
match_regexps = []
|
||||
|
||||
# The second entry is for 'Big Money', which comes from a different site, uses different markup
|
||||
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body', 'story']}),
|
||||
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
|
||||
dict(attrs={ 'id':['content']}) ]
|
||||
|
||||
# The second entry is for 'Big Money', which comes from a different site, uses different markup
|
||||
remove_tags = [dict(attrs={ 'id':[
|
||||
'add_comments_button',
|
||||
'article_bottom_tools',
|
||||
'article_bottom_tools_cntr',
|
||||
'bizbox_links_bottom',
|
||||
'BOXXLE',
|
||||
'comments_button',
|
||||
'comments-to-fray',
|
||||
'fbog_article_bottom_cntr',
|
||||
'fray_article_discussion', 'fray_article_links','bottom_sponsored_links','author_bio',
|
||||
'insider_ad_wrapper',
|
||||
'js_kit_cntr',
|
||||
'recommend_tab',
|
||||
'ris_links_wrapper',
|
||||
'toolbox',
|
||||
]}),
|
||||
remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
|
||||
'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
|
||||
'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
|
||||
'comments_button','add_comments_button','comments-to-fray','marriott_ad',
|
||||
'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
|
||||
dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
|
||||
|
||||
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
|
||||
@ -72,16 +67,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
extra_css = '''
|
||||
.h1_subhead{font-family:Arial; font-size:small; }
|
||||
h1{font-family:Verdana; font-size:large; }
|
||||
.byline {font-family:Georgia; margin-bottom: 0px; color: #660033;}
|
||||
.dateline {font-family:Arial; font-size: smaller; height: 0pt; color:#666666;}
|
||||
.byline {font-family:Georgia; margin-bottom: 0px; }
|
||||
.dateline {font-family:Arial; font-size: smaller; height: 0pt;}
|
||||
.imagewrapper {font-family:Verdana;font-size:x-small; }
|
||||
.source {font-family:Verdana; font-size:x-small;}
|
||||
.credit {font-family:Verdana; font-size: smaller;}
|
||||
#article_body {font-family:Verdana; }
|
||||
#content {font-family:Arial; }
|
||||
.caption{font-family:Verdana;font-style:italic; font-size:x-small;}
|
||||
h3{font-family:Arial; color:#666666; font-size:small}
|
||||
a{color:#0066CC;}
|
||||
h3{font-family:Arial; font-size:small}
|
||||
'''
|
||||
|
||||
# Local variables to extend class
|
||||
@ -99,32 +93,59 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
if isinstance(item, (NavigableString, CData)):
|
||||
strings.append(item.string)
|
||||
elif isinstance(item, Tag):
|
||||
res = self.tag_to_string(item)
|
||||
res = self.tag_to_string(item,use_alt=False)
|
||||
if res:
|
||||
strings.append(res)
|
||||
return strings
|
||||
|
||||
|
||||
def extract_sections(self):
|
||||
def extract_named_sections(self):
|
||||
soup = self.index_to_soup( self.baseURL )
|
||||
soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
|
||||
soup_nav_bar = soup.find(True, attrs={'id':'nav'})
|
||||
briefing_nav = soup.find('li')
|
||||
briefing_url = briefing_nav.a['href']
|
||||
for section_nav in soup_nav_bar.findAll('li'):
|
||||
section_name = self.tag_to_string(section_nav,use_alt=False)
|
||||
self.section_dates.append(section_name)
|
||||
|
||||
soup = self.index_to_soup(briefing_url)
|
||||
|
||||
self.log("Briefing url = %s " % briefing_url)
|
||||
section_lists = soup.findAll('ul','view_links_list')
|
||||
|
||||
sections = []
|
||||
for section in section_lists :
|
||||
sections.append(section)
|
||||
return sections
|
||||
|
||||
|
||||
def extract_dated_sections(self):
|
||||
soup = self.index_to_soup( self.baseURL )
|
||||
soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
|
||||
if soup_top_stories:
|
||||
self.section_dates.append("Top Stories")
|
||||
self.log("SELECTION TOP STORIES %s" % "Top Stories")
|
||||
|
||||
soup = soup.find(True, attrs={'id':'toc_links_container'})
|
||||
|
||||
todays_section = soup.find(True, attrs={'class':'todaydateline'})
|
||||
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
|
||||
self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))
|
||||
|
||||
older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
|
||||
for older_section in older_section_dates :
|
||||
self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
|
||||
self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))
|
||||
|
||||
if soup_top_stories:
|
||||
headline_stories = soup_top_stories.find('ul')
|
||||
headline_stories = soup_top_stories
|
||||
self.log("HAVE top_stories")
|
||||
else:
|
||||
headline_stories = None
|
||||
self.log("NO top_stories")
|
||||
section_lists = soup.findAll('ul')
|
||||
# Prepend the headlines to the first section
|
||||
if headline_stories:
|
||||
section_lists[0].insert(0,headline_stories)
|
||||
section_lists.insert(0,headline_stories)
|
||||
|
||||
sections = []
|
||||
for section in section_lists :
|
||||
@ -133,9 +154,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
|
||||
|
||||
def extract_section_articles(self, sections_html) :
|
||||
# Find the containers with section content
|
||||
soup = self.index_to_soup(str(sections_html))
|
||||
sections = soup.findAll('ul')
|
||||
# Find the containers with section content
|
||||
sections = sections_html
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
@ -145,10 +165,25 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
|
||||
# Get the section name
|
||||
if section.has_key('id') :
|
||||
self.log("PROCESSING SECTION id = %s" % section['id'])
|
||||
key = self.section_dates[i]
|
||||
if key.startswith("Pod"):
|
||||
continue
|
||||
if key.startswith("Blog"):
|
||||
continue
|
||||
articles[key] = []
|
||||
ans.append(key)
|
||||
elif self.slate_complete:
|
||||
key = self.section_dates[i]
|
||||
if key.startswith("Pod"):
|
||||
continue
|
||||
if key.startswith("Blog"):
|
||||
continue
|
||||
self.log("PROCESSING SECTION name = %s" % key)
|
||||
articles[key] = []
|
||||
ans.append(key)
|
||||
else :
|
||||
self.log("SECTION %d HAS NO id" % i);
|
||||
continue
|
||||
|
||||
# Get the section article_list
|
||||
@ -159,8 +194,10 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
bylines = self.tag_to_strings(article)
|
||||
url = article.a['href']
|
||||
title = bylines[0]
|
||||
full_title = self.tag_to_string(article)
|
||||
|
||||
full_title = self.tag_to_string(article,use_alt=False)
|
||||
#self.log("ARTICLE TITLE%s" % title)
|
||||
#self.log("ARTICLE FULL_TITLE%s" % full_title)
|
||||
#self.log("URL %s" % url)
|
||||
author = None
|
||||
description = None
|
||||
pubdate = None
|
||||
@ -191,7 +228,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
|
||||
found_excluded = excluded.search(description)
|
||||
if found_excluded :
|
||||
if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
# Skip articles whose title contain excluded keywords
|
||||
@ -200,7 +237,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
#self.log("evaluating full_title: %s" % full_title)
|
||||
found_excluded = excluded.search(full_title)
|
||||
if found_excluded :
|
||||
if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
# Skip articles whose author contain excluded keywords
|
||||
@ -208,7 +245,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
excluded = re.compile('|'.join(self.excludedAuthorKeywords))
|
||||
found_excluded = excluded.search(author)
|
||||
if found_excluded :
|
||||
if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
|
||||
continue
|
||||
|
||||
skip_this_article = False
|
||||
@ -216,6 +253,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
for article in articles[key] :
|
||||
if article['url'] == url :
|
||||
skip_this_article = True
|
||||
self.log("SKIPPING DUP %s" % url)
|
||||
break
|
||||
|
||||
if skip_this_article :
|
||||
@ -227,6 +265,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
articles[feed] = []
|
||||
articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
|
||||
author=author, content=''))
|
||||
#self.log("KEY %s" % feed)
|
||||
#self.log("APPENDED %s" % url)
|
||||
# Promote 'newspapers' to top
|
||||
for (i,article) in enumerate(articles[feed]) :
|
||||
if article['description'] is not None :
|
||||
@ -235,32 +275,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
ans = self.remove_duplicates(ans)
|
||||
return ans
|
||||
|
||||
def flatten_document(self, ans):
|
||||
flat_articles = []
|
||||
for (i,section) in enumerate(ans) :
|
||||
#self.log("flattening section %s: " % section[0])
|
||||
for article in section[1] :
|
||||
#self.log("moving %s to flat_articles[]" % article['title'])
|
||||
flat_articles.append(article)
|
||||
flat_section = ['All Articles', flat_articles]
|
||||
flat_ans = [flat_section]
|
||||
return flat_ans
|
||||
|
||||
def remove_duplicates(self, ans):
|
||||
# Return a stripped ans
|
||||
for (i,section) in enumerate(ans) :
|
||||
#self.log("section %s: " % section[0])
|
||||
for article in section[1] :
|
||||
#self.log("\t%s" % article['title'])
|
||||
#self.log("\looking for %s" % article['url'])
|
||||
for (j,subsequent_section) in enumerate(ans[i+1:]) :
|
||||
for (k,subsequent_article) in enumerate(subsequent_section[1]) :
|
||||
if article['url'] == subsequent_article['url'] :
|
||||
#self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
|
||||
del subsequent_section[1][k]
|
||||
return ans
|
||||
|
||||
def print_version(self, url) :
|
||||
@ -268,13 +282,22 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
|
||||
# Class methods
|
||||
def parse_index(self) :
|
||||
sections = self.extract_sections()
|
||||
if self.slate_complete:
|
||||
sections = self.extract_named_sections()
|
||||
else:
|
||||
sections = self.extract_dated_sections()
|
||||
section_list = self.extract_section_articles(sections)
|
||||
section_list = self.flatten_document(section_list)
|
||||
return section_list
|
||||
|
||||
def get_browser(self) :
|
||||
return BasicNewsRecipe.get_browser()
|
||||
def get_masthead_url(self):
|
||||
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(masthead)
|
||||
except:
|
||||
self.log("\nMasthead unavailable")
|
||||
masthead = None
|
||||
return masthead
|
||||
|
||||
def stripAnchors(self,soup):
|
||||
body = soup.find('div',attrs={'id':['article_body','content']})
|
||||
@ -304,8 +327,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
excluded = re.compile('|'.join(self.excludedContentKeywords))
|
||||
found_excluded = excluded.search(str(soup))
|
||||
if found_excluded :
|
||||
print "no allowed content found, removing article"
|
||||
raise Exception('String error')
|
||||
print "No allowed content found, removing article"
|
||||
raise Exception('Rejected article')
|
||||
|
||||
# Articles from www.thebigmoney.com use different tagging for byline, dateline and body
|
||||
head = soup.find('head')
|
||||
@ -338,7 +361,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
|
||||
if dept_kicker is not None :
|
||||
kicker_strings = self.tag_to_strings(dept_kicker)
|
||||
#kicker = kicker_strings[2] + kicker_strings[3]
|
||||
kicker = ''.join(kicker_strings[2:])
|
||||
kicker = re.sub('\.','',kicker)
|
||||
h3Tag = Tag(soup, "h3")
|
||||
@ -346,25 +368,11 @@ class PeriodicalNameHere(BasicNewsRecipe):
|
||||
emTag.insert(0,NavigableString(kicker))
|
||||
h3Tag.insert(0, emTag)
|
||||
dept_kicker.replaceWith(h3Tag)
|
||||
else:
|
||||
self.log("No kicker--return null")
|
||||
return None
|
||||
|
||||
# Change <h1> to <h2>
|
||||
headline = soup.find("h1")
|
||||
#tag = headline.find("span")
|
||||
#tag.name = 'div'
|
||||
|
||||
if headline is not None :
|
||||
h2tag = Tag(soup, "h2")
|
||||
h2tag['class'] = "headline"
|
||||
strs = self.tag_to_strings(headline)
|
||||
result = ''
|
||||
for (i,substr) in enumerate(strs) :
|
||||
result += substr
|
||||
if i < len(strs) -1 :
|
||||
result += '<br />'
|
||||
#h2tag.insert(0, result)
|
||||
#headline.replaceWith(h2tag)
|
||||
|
||||
# Fix up the concatenated byline and dateline
|
||||
# Fix up the concatenated byline and dateline
|
||||
byline = soup.find(True,attrs={'class':'byline'})
|
||||
if byline is not None :
|
||||
bylineTag = Tag(soup,'div')
|
||||
|
@ -5,15 +5,16 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Timothy Legge <timlegge at gmail.com> and Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import os, time
|
||||
import sqlite3 as sqlite
|
||||
|
||||
from calibre.devices.usbms.books import BookList
|
||||
from calibre.devices.kobo.books import Book
|
||||
from calibre.devices.kobo.books import ImageWrapper
|
||||
from calibre.devices.mime import mime_type_ext
|
||||
from calibre.devices.usbms.driver import USBMS
|
||||
from calibre.devices.usbms.driver import USBMS, debug_print
|
||||
from calibre import prints
|
||||
from calibre.devices.usbms.books import CollectionsBookList
|
||||
|
||||
class KOBO(USBMS):
|
||||
|
||||
@ -21,12 +22,15 @@ class KOBO(USBMS):
|
||||
gui_name = 'Kobo Reader'
|
||||
description = _('Communicate with the Kobo Reader')
|
||||
author = 'Timothy Legge and Kovid Goyal'
|
||||
version = (1, 0, 4)
|
||||
version = (1, 0, 6)
|
||||
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
|
||||
booklist_class = CollectionsBookList
|
||||
|
||||
# Ordered list of supported formats
|
||||
FORMATS = ['epub', 'pdf']
|
||||
CAN_SET_METADATA = True
|
||||
|
||||
VENDOR_ID = [0x2237]
|
||||
PRODUCT_ID = [0x4161]
|
||||
@ -40,6 +44,12 @@ class KOBO(USBMS):
|
||||
|
||||
VIRTUAL_BOOK_EXTENSIONS = frozenset(['kobo'])
|
||||
|
||||
EXTRA_CUSTOMIZATION_MESSAGE = _('The Kobo supports only one collection '
|
||||
'currently: the \"Im_Reading\" list. Create a tag called \"Im_Reading\" ')+\
|
||||
'for automatic management'
|
||||
|
||||
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['tags'])
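A minimal sketch of how this customization string is consumed (it mirrors the sync_booklists() change later in this commit; the variable names here are illustrative only):

    # Sketch only: the comma-separated customization value becomes a list of
    # collection attribute names, 'tags' by default.
    extra_customization = 'tags'  # EXTRA_CUSTOMIZATION_DEFAULT
    collections = [x.lower().strip() for x in extra_customization.split(',')]
    # -> ['tags']; a calibre tag named "Im_Reading" then maps onto the
    # device's Im_Reading list in update_device_database_collections().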
|
||||
|
||||
def initialize(self):
|
||||
USBMS.initialize(self)
|
||||
self.book_class = Book
|
||||
@ -63,6 +73,8 @@ class KOBO(USBMS):
|
||||
self._card_b_prefix if oncard == 'cardb' \
|
||||
else self._main_prefix
|
||||
|
||||
self.booklist_class.rebuild_collections = self.rebuild_collections
|
||||
|
||||
# get the metadata cache
|
||||
bl = self.booklist_class(oncard, prefix, self.settings)
|
||||
need_sync = self.parse_metadata_cache(bl, prefix, self.METADATA_CACHE)
|
||||
@ -85,9 +97,7 @@ class KOBO(USBMS):
|
||||
playlist_map = {}
|
||||
|
||||
if readstatus == 1:
|
||||
if lpath not in playlist_map:
|
||||
playlist_map[lpath] = []
|
||||
playlist_map[lpath].append("I\'m Reading")
|
||||
playlist_map[lpath]= "Im_Reading"
|
||||
|
||||
path = self.normalize_path(path)
|
||||
# print "Normalized FileName: " + path
|
||||
@ -104,14 +114,17 @@ class KOBO(USBMS):
|
||||
if self.update_metadata_item(bl[idx]):
|
||||
# print 'update_metadata_item returned true'
|
||||
changed = True
|
||||
bl[idx].device_collections = playlist_map.get(lpath, [])
|
||||
if lpath in playlist_map and \
|
||||
playlist_map[lpath] not in bl[idx].device_collections:
|
||||
bl[idx].device_collections.append(playlist_map[lpath])
|
||||
else:
|
||||
if ContentType == '6':
|
||||
book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
|
||||
else:
|
||||
book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
|
||||
# print 'Update booklist'
|
||||
book.device_collections = playlist_map.get(book.lpath, [])
|
||||
book.device_collections = [playlist_map[lpath]] if lpath in playlist_map else []
|
||||
|
||||
if bl.add_book(book, replace_metadata=False):
|
||||
changed = True
|
||||
except: # Probably a path encoding error
|
||||
@ -398,3 +411,95 @@ class KOBO(USBMS):
|
||||
size = os.stat(cls.normalize_path(os.path.join(prefix, lpath))).st_size
|
||||
book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=size, other=mi)
|
||||
return book
|
||||
|
||||
def get_device_paths(self):
|
||||
paths, prefixes = {}, {}
|
||||
for prefix, path, source_id in [
|
||||
('main', 'metadata.calibre', 0),
|
||||
('card_a', 'metadata.calibre', 1),
|
||||
('card_b', 'metadata.calibre', 2)
|
||||
]:
|
||||
prefix = getattr(self, '_%s_prefix'%prefix)
|
||||
if prefix is not None and os.path.exists(prefix):
|
||||
paths[source_id] = os.path.join(prefix, *(path.split('/')))
|
||||
return paths
|
||||
|
||||
def update_device_database_collections(self, booklists, collections_attributes):
|
||||
# debug_print('Starting update_device_database_collections', collections_attributes)
|
||||
|
||||
# Force collections_attributes to be 'tags' as no other is currently supported
|
||||
# debug_print('KOBO: overriding the provided collections_attributes:', collections_attributes)
|
||||
collections_attributes = ['tags']
|
||||
|
||||
collections = booklists.get_collections(collections_attributes)
|
||||
# debug_print('Collections', collections)
|
||||
for category, books in collections.items():
|
||||
if category == 'Im_Reading':
|
||||
# Create a connection to the sqlite database
|
||||
connection = sqlite.connect(self._main_prefix + '.kobo/KoboReader.sqlite')
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Reset Im_Reading list in the database
|
||||
query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null'
|
||||
try:
|
||||
cursor.execute (query)
|
||||
except:
|
||||
debug_print('Database Exception: Unable to reset Im_Reading list')
|
||||
raise
|
||||
else:
|
||||
# debug_print('Commit: Reset Im_Reading list')
|
||||
connection.commit()
|
||||
|
||||
for book in books:
|
||||
# debug_print('Title:', book.title, 'lpath:', book.path)
|
||||
book.device_collections = ['Im_Reading']
|
||||
|
||||
extension = os.path.splitext(book.path)[1]
|
||||
ContentType = self.get_content_type_from_extension(extension)
|
||||
|
||||
ContentID = self.contentid_from_path(book.path, ContentType)
|
||||
datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())
|
||||
|
||||
t = (datelastread,ContentID,)
|
||||
|
||||
try:
|
||||
cursor.execute('update content set ReadStatus=1,FirstTimeReading=\'false\',DateLastRead=? where BookID is Null and ContentID = ?', t)
|
||||
except:
|
||||
debug_print('Database Exception: Unable to create Im_Reading list')
|
||||
raise
|
||||
else:
|
||||
connection.commit()
|
||||
# debug_print('Database: Commit create Im_Reading list')
|
||||
|
||||
cursor.close()
|
||||
connection.close()
|
||||
|
||||
# debug_print('Finished update_device_database_collections', collections_attributes)
|
||||
|
||||
def sync_booklists(self, booklists, end_session=True):
|
||||
# debug_print('KOBO: started sync_booklists')
|
||||
paths = self.get_device_paths()
|
||||
|
||||
blists = {}
|
||||
for i in paths:
|
||||
if booklists[i] is not None:
|
||||
#debug_print('Booklist: ', i)
|
||||
blists[i] = booklists[i]
|
||||
opts = self.settings()
|
||||
if opts.extra_customization:
|
||||
collections = [x.lower().strip() for x in
|
||||
opts.extra_customization.split(',')]
|
||||
else:
|
||||
collections = []
|
||||
|
||||
#debug_print('KOBO: collection fields:', collections)
|
||||
for i, blist in blists.items():
|
||||
self.update_device_database_collections(blist, collections)
|
||||
|
||||
USBMS.sync_booklists(self, booklists, end_session=end_session)
|
||||
#debug_print('KOBO: finished sync_booklists')
|
||||
|
||||
def rebuild_collections(self, booklist, oncard):
|
||||
collections_attributes = []
|
||||
self.update_device_database_collections(booklist, collections_attributes)
|
||||
|
||||
|
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
|
||||
'chapter', 'chapter_mark',
|
||||
'prefer_metadata_cover', 'remove_first_image',
|
||||
'insert_metadata', 'page_breaks_before',
|
||||
'preprocess_html',
|
||||
'preprocess_html', 'html_unwrap_factor',
|
||||
]
|
||||
),
|
||||
|
||||
|
@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html',
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='html_unwrap_factor',
|
||||
recommended_value=0.40, level=OptionRecommendation.LOW,
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.40, just below the median line length. This will unwrap typical books '
|
||||
' with hard line breaks, but should be reduced if the line length is variable.'
|
||||
)
|
||||
),
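# Sketch only: how downstream code can pick up the new option. The getattr()
# fallback mirrors the PreProcessor change later in this commit, so callers
# without the attribute still get the 0.40 default. _ExampleOpts is a purely
# illustrative stand-in for the merged conversion options object.
class _ExampleOpts(object):
    html_unwrap_factor = 0.35

factor = getattr(_ExampleOpts(), 'html_unwrap_factor', 0.4)
assert 0.0 <= factor <= 1.0  # the help text and GUI spin box use the same 0-1 range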
OptionRecommendation(name='smarten_punctuation',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Convert plain quotes, dashes and ellipsis to their '
|
||||
|
@ -351,7 +351,7 @@ class HTMLPreProcessor(object):
|
||||
# print "The pdf line length returned is " + str(length)
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
(re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
|
@ -11,10 +11,11 @@ from calibre.utils.logging import default_log
|
||||
|
||||
class PreProcessor(object):
|
||||
|
||||
def __init__(self, log=None):
|
||||
def __init__(self, log=None, extra_opts=None):
|
||||
self.log = default_log if log is None else log
|
||||
self.html_preprocess_sections = 0
|
||||
self.found_indents = 0
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def chapter_head(self, match):
|
||||
chap = match.group('chap')
|
||||
@ -91,6 +92,7 @@ class PreProcessor(object):
|
||||
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
||||
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
||||
blanklines = blankreg.findall(html)
|
||||
lines = linereg.findall(html)
|
||||
if len(lines) > 1:
|
||||
@ -147,15 +149,16 @@ class PreProcessor(object):
|
||||
format = 'html'
|
||||
|
||||
# Calculate Length
|
||||
length = line_length(format, html, 0.4)
|
||||
length = line_length('pdf', html, getattr(self.extra_opts,
|
||||
'html_unwrap_factor', 0.4))
|
||||
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
|
||||
#
|
||||
# Unwrap and/or delete soft-hyphens, hyphens
|
||||
html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||
|
||||
# Unwrap lines using punctation if the median length of all lines is less than 200
|
||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
# Unwrap lines using punctation and line length
|
||||
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
html = unwrap.sub(' ', html)
|
||||
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
|
@ -12,6 +12,7 @@ from copy import deepcopy
|
||||
from lxml import etree
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
from calibre import guess_type
|
||||
|
||||
class Canvas(etree.XSLTExtension):
|
||||
@ -419,4 +420,9 @@ class LRFInput(InputFormatPlugin):
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
|
||||
def preprocess_html(self, html):
|
||||
preprocessor = PreProcessor(log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
||||
|
||||
|
@ -26,8 +26,10 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
||||
'remove_first_image',
|
||||
'insert_metadata', 'page_breaks_before',
|
||||
'preprocess_html', 'remove_header', 'header_regex',
|
||||
'remove_footer', 'footer_regex']
|
||||
'remove_footer', 'footer_regex','html_unwrap_factor']
|
||||
)
|
||||
self.opt_html_unwrap_factor.setEnabled(False)
|
||||
self.huf_label.setEnabled(False)
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in ('pagebreak', 'rule', 'both', 'none'):
|
||||
self.opt_chapter_mark.addItem(x)
|
||||
@ -64,3 +66,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
||||
_('The XPath expression %s is invalid.')%x.text).exec_()
|
||||
return False
|
||||
return True
|
||||
|
||||
def set_value_handler(self, g, val):
|
||||
if val is None and g is self.opt_html_unwrap_factor:
|
||||
g.setValue(0.0)
|
||||
return True
|
||||
|
@ -14,10 +14,10 @@
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="0" column="0" colspan="2">
|
||||
<item row="0" column="1" colspan="2">
|
||||
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<item row="1" column="0" colspan="2">
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string>Chapter &mark:</string>
|
||||
@ -27,31 +27,31 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<item row="1" column="2">
|
||||
<widget class="QComboBox" name="opt_chapter_mark">
|
||||
<property name="minimumContentsLength">
|
||||
<number>20</number>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<item row="2" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_remove_first_image">
|
||||
<property name="text">
|
||||
<string>Remove first &image</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<item row="5" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_insert_metadata">
|
||||
<property name="text">
|
||||
<string>Insert &metadata as page at start of book</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="10" column="0" colspan="2">
|
||||
<item row="11" column="0" colspan="3">
|
||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||
</item>
|
||||
<item row="11" column="0" colspan="2">
|
||||
<item row="12" column="0" colspan="3">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -64,27 +64,66 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="7" column="0">
|
||||
<item row="8" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_remove_footer">
|
||||
<property name="text">
|
||||
<string>Remove F&ooter</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="5" column="0">
|
||||
<item row="6" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_remove_header">
|
||||
<property name="text">
|
||||
<string>Remove H&eader</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0" colspan="2">
|
||||
<item row="7" column="0" colspan="3">
|
||||
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
|
||||
</item>
|
||||
<item row="8" column="0" colspan="2">
|
||||
<item row="9" column="0" colspan="3">
|
||||
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<item row="4" column="1">
|
||||
<widget class="QLabel" name="huf_label">
|
||||
<property name="text">
|
||||
<string>Line &un-wrap factor during preprocess:</string>
|
||||
</property>
|
||||
<property name="buddy">
|
||||
<cstring>opt_html_unwrap_factor</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="2">
|
||||
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
|
||||
<property name="toolTip">
|
||||
<string/>
|
||||
</property>
|
||||
<property name="maximum">
|
||||
<double>1.000000000000000</double>
|
||||
</property>
|
||||
<property name="singleStep">
|
||||
<double>0.050000000000000</double>
|
||||
</property>
|
||||
<property name="value">
|
||||
<double>0.400000000000000</double>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<spacer name="horizontalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Horizontal</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>40</width>
|
||||
<height>20</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="3" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_preprocess_html">
|
||||
<property name="text">
|
||||
<string>&Preprocess input file to possibly improve structure detection</string>
|
||||
@ -108,5 +147,38 @@
|
||||
</customwidget>
|
||||
</customwidgets>
|
||||
<resources/>
|
||||
<connections/>
|
||||
<connections>
|
||||
<connection>
|
||||
<sender>opt_preprocess_html</sender>
|
||||
<signal>toggled(bool)</signal>
|
||||
<receiver>opt_html_unwrap_factor</receiver>
|
||||
<slot>setEnabled(bool)</slot>
|
||||
<hints>
|
||||
<hint type="sourcelabel">
|
||||
<x>328</x>
|
||||
<y>87</y>
|
||||
</hint>
|
||||
<hint type="destinationlabel">
|
||||
<x>481</x>
|
||||
<y>113</y>
|
||||
</hint>
|
||||
</hints>
|
||||
</connection>
|
||||
<connection>
|
||||
<sender>opt_preprocess_html</sender>
|
||||
<signal>toggled(bool)</signal>
|
||||
<receiver>huf_label</receiver>
|
||||
<slot>setEnabled(bool)</slot>
|
||||
<hints>
|
||||
<hint type="sourcelabel">
|
||||
<x>295</x>
|
||||
<y>88</y>
|
||||
</hint>
|
||||
<hint type="destinationlabel">
|
||||
<x>291</x>
|
||||
<y>105</y>
|
||||
</hint>
|
||||
</hints>
|
||||
</connection>
|
||||
</connections>
|
||||
</ui>
|
||||
|
@ -6,10 +6,7 @@ The dialog used to edit meta information for a book as well as
|
||||
add/remove formats
|
||||
'''
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
import os, re, time, traceback, textwrap
|
||||
|
||||
from PyQt4.Qt import SIGNAL, QObject, Qt, QTimer, QThread, QDate, \
|
||||
QPixmap, QListWidgetItem, QDialog, pyqtSignal
|
||||
@ -331,6 +328,14 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
ResizableDialog.__init__(self, window)
|
||||
self.bc_box.layout().setAlignment(self.cover, Qt.AlignCenter|Qt.AlignHCenter)
|
||||
self.cancel_all = False
|
||||
base = unicode(self.author_sort.toolTip())
|
||||
self.ok_aus_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+
|
||||
_(' The green color indicates that the current '
|
||||
'author sort matches the current author'))
|
||||
self.bad_aus_tooltip = '<p>'+textwrap.fill(base + '<br><br>'+
|
||||
_(' The red color indicates that the current '
|
||||
'author sort does not match the current author'))
|
||||
|
||||
if cancel_all:
|
||||
self.__abort_button = self.button_box.addButton(self.button_box.Abort)
|
||||
self.__abort_button.setToolTip(_('Abort the editing of all remaining books'))
|
||||
@ -490,6 +495,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
|
||||
col = 'rgb(255, 0, 0, 20%)'
|
||||
self.author_sort.setStyleSheet('QLineEdit { color: black; '
|
||||
'background-color: %s; }'%col)
|
||||
tt = self.ok_aus_tooltip if normal else self.bad_aus_tooltip
|
||||
self.author_sort.setToolTip(tt)
|
||||
|
||||
def validate_isbn(self, isbn):
|
||||
isbn = unicode(isbn).strip()
|
||||
|
@ -329,6 +329,17 @@ There are a few more options in this section.
of as a separate cover. If you also specify a cover in |app|, then the converted book will have
two covers. This option will simply remove the first image from the source document, thereby
ensuring that the converted book has only one cover, the one specified in |app|.

:guilabel:`Preprocess input`
This option activates various algorithms that try to detect and correct common cases of
badly formatted input documents, such as hard line breaks and large blocks of text with no formatting.
Turn this option on if your input document suffers from bad formatting. But be aware that in
some cases, this option can lead to worse results, so use with care.

:guilabel:`Line-unwrap factor`
This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
than the length of 40% of all lines in the document.
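As an illustration only (a simple percentile assumption; calibre's actual line_length() helper may differ in detail), the factor can be thought of as picking a length threshold from the distribution of line lengths::

    # Illustrative sketch only -- not calibre's real line_length() implementation.
    def unwrap_threshold(lines, factor=0.4):
        # Sort the non-blank line lengths and take the value `factor` of the
        # way through the distribution, so with 0.4 roughly 40% of the lines
        # are shorter than the returned threshold.
        lengths = sorted(len(line) for line in lines if line.strip())
        if not lengths:
            return 0
        return lengths[min(int(len(lengths) * factor), len(lengths) - 1)]

    # The unwrap regular expressions elsewhere in this commit use such a
    # number as a lookbehind width: a hard line break is only joined when at
    # least that many characters precede it on the line.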
Table of Contents
------------------