Merge from trunk

This commit is contained in:
Charles Haley 2011-09-25 09:01:33 +02:00
commit b2dceb1a7a
4 changed files with 63 additions and 25 deletions

View File

@ -1,6 +1,3 @@
'''
dnaindia.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
@ -12,6 +9,10 @@ class DNAIndia(BasicNewsRecipe):
language = 'en_IN'
encoding = 'cp1252'
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('Top News', 'http://www.dnaindia.com/syndication/rss_topnews.xml'),
@ -22,15 +23,10 @@ class DNAIndia(BasicNewsRecipe):
('World', 'http://www.dnaindia.com/syndication/rss,catid-9.xml'),
('Money', 'http://www.dnaindia.com/syndication/rss,catid-4.xml'),
('Sports', 'http://www.dnaindia.com/syndication/rss,catid-6.xml'),
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml'),
('Digital Life', 'http://www.dnaindia.com/syndication/rss,catid-1089741.xml'),
('After Hours', 'http://www.dnaindia.com/syndication/rss,catid-7.xml')
]
remove_tags = [{'id':['footer', 'lhs-col']}, {'class':['bottom', 'categoryHead',
'article_tools']}]
keep_only_tags = dict(id='middle-col')
remove_tags_after=[dict(attrs={'id':'story'})]
remove_attributes=['style']
no_stylesheets = True
def print_version(self, url):
match = re.search(r'newsid=(\d+)', url)

View File

@ -9,11 +9,16 @@ class TimesOfIndia(BasicNewsRecipe):
max_articles_per_feed = 25
no_stylesheets = True
keep_only_tags = [{'class':['maintable12', 'prttabl']}]
remove_attributes = ['style']
keep_only_tags = [
{'class':re.compile(r'maintable12|prttabl')},
{'id':['mod-article-header',
'mod-a-body-after-first-para', 'mod-a-body-first-para']},
]
remove_tags = [
dict(style=lambda x: x and 'float' in x),
{'class':['prvnxtbg', 'footbdrin', 'bcclftr']},
]
{'class':re.compile('tabsintbgshow|prvnxtbg')},
{'id':['fbrecommend', 'relmaindiv']}
]
feeds = [
('Top Stories',
@ -41,6 +46,8 @@ class TimesOfIndia(BasicNewsRecipe):
]
def get_article_url(self, article):
# Times of India sometimes serves an ad page instead of the article;
# this code detects and circumvents that
url = BasicNewsRecipe.get_article_url(self, article)
if '/0Ltimesofindia' in url:
url = url.partition('/0L')[-1]
@ -61,6 +68,3 @@ class TimesOfIndia(BasicNewsRecipe):
return url
def preprocess_html(self, soup):
return soup

View File

@ -73,6 +73,20 @@ class Worker(Thread): # Get details {{{
8: ['août'],
9: ['sept'],
12: ['déc'],
},
'es': {
1: ['enero'],
2: ['febrero'],
3: ['marzo'],
4: ['abril'],
5: ['mayo'],
6: ['junio'],
7: ['julio'],
8: ['agosto'],
9: ['septiembre', 'setiembre'],
10: ['octubre'],
11: ['noviembre'],
12: ['diciembre'],
},
'jp': {
1: [u'1月'],
@ -101,13 +115,16 @@ class Worker(Thread): # Get details {{{
text()="Dettagli prodotto" or \
text()="Product details" or \
text()="Détails sur le produit" or \
text()="Detalles del producto" or \
text()="登録情報"]/../div[@class="content"]
'''
# The "Editor:" label is the Spanish-site publisher heading
self.publisher_xpath = '''
descendant::*[starts-with(text(), "Publisher:") or \
starts-with(text(), "Verlag:") or \
starts-with(text(), "Editore:") or \
starts-with(text(), "Editeur") or \
starts-with(text(), "Editor:") or \
starts-with(text(), "出版社:")]
'''
self.language_xpath = '''
@ -116,12 +133,14 @@ class Worker(Thread): # Get details {{{
or text() = "Language" \
or text() = "Sprache:" \
or text() = "Lingua:" \
or text() = "Idioma:" \
or starts-with(text(), "Langue") \
or starts-with(text(), "言語") \
]
'''
self.ratings_pat = re.compile(
r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち) ([\d\.]+)( (stars|Sternen|stelle)){0,1}')
r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de) ([\d\.]+)( (stars|Sternen|stelle|estrellas)){0,1}')
lm = {
'eng': ('English', 'Englisch'),
@ -143,6 +162,7 @@ class Worker(Thread): # Get details {{{
for i, vals in self.months.iteritems():
for x in vals:
ans = ans.replace(x, self.english_months[i])
ans = ans.replace(' de ', ' ')
return ans
def run(self):
@ -422,6 +442,7 @@ class Amazon(Source):
'uk' : _('UK'),
'it' : _('Italy'),
'jp' : _('Japan'),
'es' : _('Spain'),
}
options = (
@ -789,6 +810,16 @@ if __name__ == '__main__': # tests {{{
),
] # }}}
es_tests = [ # {{{
(
{'identifiers':{'isbn': '8483460831'}},
[title_test('Tiempos Interesantes',
exact=True), authors_test(['Terry Pratchett'])
]
),
] # }}}
jp_tests = [ # {{{
( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720' }},
@ -804,6 +835,6 @@ if __name__ == '__main__': # tests {{{
] # }}}
test_identify_plugin(Amazon.name, com_tests)
#test_identify_plugin(Amazon.name, jp_tests)
#test_identify_plugin(Amazon.name, es_tests)
# }}}

View File

@ -173,6 +173,10 @@ class PDFWriter(QObject): # {{{
printer.setOutputFormat(QPrinter.NativeFormat)
self.view.print_(printer)
printer.abort()
else:
# The document is so corrupt that we can't render the page.
self.loop.exit(0)
raise Exception('Document cannot be rendered.')
self._render_book()
def _delete_tmpdir(self):
@ -207,11 +211,14 @@ class PDFWriter(QObject): # {{{
try:
outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author)
for item in self.combine_queue:
with open(item, 'rb') as item_stream:
inputPDF = PdfFileReader(item_stream)
for page in inputPDF.pages:
outPDF.addPage(page)
outPDF.write(self.out_stream)
# The input PDF stream must remain open until the final PDF is
# written to disk: PyPDF keeps references to pages in the input
# PDF on disk rather than copying them into memory, so closing
# the input stream before the final write would break the output.
inputPDF = PdfFileReader(open(item, 'rb'))
for page in inputPDF.pages:
outPDF.addPage(page)
outPDF.write(self.out_stream)
finally:
self._delete_tmpdir()
self.loop.exit(0)