0.8.27+, GwR patches for 875726, 892468

This commit is contained in:
GRiker 2011-11-19 09:04:12 -07:00
commit dd8960eeac
16 changed files with 102 additions and 82 deletions

View File

@ -1,11 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe(BasicNewsRecipe):
title = 'heise online'
title = 'Heise-online'
description = 'News vom Heise-Verlag'
__author__ = 'schuster'
masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
use_embedded_content = False
language = 'de'
oldest_article = 2
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
remove_empty_feeds = True
timeout = 5
no_stylesheets = True
encoding = 'utf-8'
remove_tags_after = dict(name ='p', attrs={'class':'editor'})
remove_tags = [{'class':'navi_top_container'},
remove_tags = [dict(id='navi_top_container'),
dict(id='navi_bottom'),
dict(id='mitte_rechts'),
dict(id='navigation'),
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
dict(id='seiten_navi'),
dict(id='adbottom'),
dict(id='sitemap'),
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
]
dict(name='div', attrs={'id':'sitemap'}),
dict(name='ul', attrs={'class':'erste_zeile'}),
dict(name='ul', attrs={'class':'zweite_zeile'}),
dict(name='div', attrs={'class':'navi_top_container'})]
feeds = [
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
]
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]
def print_version(self, url):
    """Return *url* with the ``?view=print`` query string appended.

    :param url: address of an article page, as a string
    :return: the same address followed by ``?view=print``
    """
    print_suffix = '?view=print'
    return url + print_suffix

View File

@ -5,7 +5,6 @@ www.theweek.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class TheWeek(BasicNewsRecipe):
title = 'The Week Magazine'
@ -21,23 +20,7 @@ class TheWeek(BasicNewsRecipe):
encoding = 'utf-8'
use_embedded_content = False
language = 'en'
preprocess_regexps = [(re.compile(r'<h3><a href=.*</body>', re.DOTALL), lambda match: '</body>')]
remove_tags_before = dict(name='h1')
remove_tags_after = dict(name='div', attrs={'class':'articleSubscribe4free'})
remove_tags = [
dict(name='div', attrs={'class':['floatLeft','imageCaption','slideshowImageAttribution','postDate','utilities','cartoonInfo','left','middle','col300','articleSubscribe4free',' articleFlyout','articleFlyout floatRight','fourFreeBar']})
,dict(name='div', attrs={'id':['cartoonThumbs','rightColumn','header','partners']})
,dict(name='ul', attrs={'class':['slideshowNav','hotTopicsList topicList']})
]
remove_attributes = ['width','height', 'style', 'font', 'color']
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
p {font-family:Arial,Helvetica,sans-serif;}
'''
filter_regexps = [r'www\.palmcoastdata\.com']
auto_cleanup = True
feeds = [
(u'News-Opinion', u'http://theweek.com/section/index/news_opinion.rss'),
(u'Business', u'http://theweek.com/section/index/business.rss'),

View File

@ -38,10 +38,12 @@
<hr class="cbj_kindle_banner_hr" />
<!--
In addition you can add code to show the values of custom columns here.
The value is available as _column_name and the title as _column_name_label.
For example, if you have a custom column with label #genre, you can add it to
this template with:
<div>{_genre_label}: {_genre}</div>
The value is available as _column_name and the title as
_column_name_label. For example, if you have a custom column with
label #genre, you can add it to this template with _genre_label and
_genre. Note that the # is replaced by an underscore. For example
<div><b>{_genre_label}:</b> {_genre}</div>
-->
<div class="cbj_comments">{comments}</div>

View File

@ -33,7 +33,7 @@ class IREXDR1000(USBMS):
MAIN_MEMORY_VOLUME_LABEL = 'IRex Digital Reader 1000 Main Memory'
EBOOK_DIR_MAIN = 'ebooks'
EBOOK_DIR_MAIN = ''
DELETE_EXTS = ['.mbp']
SUPPORTS_SUB_DIRS = True
@ -44,7 +44,7 @@ class IREXDR800(IREXDR1000):
WINDOWS_MAIN_MEM = 'DR800'
FORMATS = ['epub', 'pdb', 'html', 'pdf', 'txt']
EBOOK_DIR_MAIN = 'Books'
EBOOK_DIR_MAIN = ''
DELETE_EXTS = []
SUPPORTS_SUB_DIRS = True

View File

@ -388,13 +388,9 @@ class KINDLE_FIRE(KINDLE2):
EBOOK_DIR_MAIN = 'Documents'
SUPPORTS_SUB_DIRS = False
SCAN_FROM_ROOT = True
SUPPORTS_SUB_DIRS_FOR_SCAN = True
VENDOR_NAME = 'AMAZON'
WINDOWS_MAIN_MEM = 'KINDLE'
def get_main_ebook_dir(self, for_upload=False):
if for_upload:
return self.EBOOK_DIR_MAIN
return ''

View File

@ -81,7 +81,7 @@ class NOOK(USBMS):
return [x.replace('#', '_') for x in components]
class NOOK_COLOR(NOOK):
description = _('Communicate with the Nook Color and TSR eBook readers.')
description = _('Communicate with the Nook Color, TSR and Tablet eBook readers.')
PRODUCT_ID = [0x002, 0x003, 0x004]
BCD = [0x216]

View File

@ -28,6 +28,8 @@ class DeviceConfig(object):
EXTRA_CUSTOMIZATION_DEFAULT = None
SUPPORTS_SUB_DIRS = False
SUPPORTS_SUB_DIRS_FOR_SCAN = False # This setting is used when scanning for
# books when SUPPORTS_SUB_DIRS is False
MUST_READ_METADATA = False
SUPPORTS_USE_AUTHOR_SORT = False

View File

@ -202,7 +202,7 @@ class USBMS(CLI, Device):
debug_print('USBMS: scan from root', self.SCAN_FROM_ROOT, ebook_dir)
if not os.path.exists(ebook_dir): continue
# Get all books in the ebook_dir directory
if self.SUPPORTS_SUB_DIRS:
if self.SUPPORTS_SUB_DIRS or self.SUPPORTS_SUB_DIRS_FOR_SCAN:
# build a list of files to check, so we can accurately report progress
flist = []
for path, dirs, files in os.walk(ebook_dir):

View File

@ -710,7 +710,8 @@ class Metadata(object):
fmt('Title sort', self.title_sort)
if self.authors:
fmt('Author(s)', authors_to_string(self.authors) + \
((' [' + self.author_sort + ']') if self.author_sort else ''))
((' [' + self.author_sort + ']')
if self.author_sort and self.author_sort != _('Unknown') else ''))
if self.publisher:
fmt('Publisher', self.publisher)
if getattr(self, 'book_producer', False):

View File

@ -6,11 +6,12 @@ Created on 4 Jun 2010
from base64 import b64encode, b64decode
import json, traceback
from datetime import datetime, time
from calibre.ebooks.metadata.book import SERIALIZABLE_FIELDS
from calibre.constants import filesystem_encoding, preferred_encoding
from calibre.library.field_metadata import FieldMetadata
from calibre.utils.date import parse_date, isoformat, UNDEFINED_DATE
from calibre.utils.date import parse_date, isoformat, UNDEFINED_DATE, local_tz
from calibre.utils.magick import Image
from calibre import isbytestring
@ -22,7 +23,13 @@ def string_to_datetime(src):
return parse_date(src)
def datetime_to_string(dateval):
    """Serialize a date or datetime to a string for storage.

    :param dateval: ``None``, a ``datetime.date`` or a ``datetime.datetime``
    :return: the literal string ``"None"`` when *dateval* is ``None`` or is
        not later than ``UNDEFINED_DATE``; otherwise ``isoformat(dateval)``
    """
    if dateval is None:
        return "None"
    if not isinstance(dateval, datetime):
        # A plain date carries no time part: promote it to midnight so it
        # can be compared with UNDEFINED_DATE below.
        dateval = datetime.combine(dateval, time())
    if dateval.tzinfo is None:
        # NOTE(review): UNDEFINED_DATE is presumably timezone-aware; naive
        # values are given the local zone before the ordering comparison.
        dateval = dateval.replace(tzinfo=local_tz)
    if dateval <= UNDEFINED_DATE:
        return "None"
    return isoformat(dateval)

View File

@ -11,7 +11,7 @@ import datetime
from urllib import quote_plus
from Queue import Queue, Empty
from lxml import etree, html
from calibre import as_unicode
from calibre import prints, as_unicode
from calibre.ebooks.chardet import xml_to_unicode
@ -54,7 +54,8 @@ class Ozon(Source):
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
# div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn!
qItems = set([isbn, title])
@ -64,7 +65,7 @@ class Ozon(Source):
qItems.discard('')
qItems = map(_quoteString, qItems)
q = ' '.join(qItems).strip()
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
@ -78,13 +79,13 @@ class Ozon(Source):
return search_url
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): # {{{
if not self.is_configured():
return
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
if not query:
err = 'Insufficient metadata to construct query'
err = u'Insufficient metadata to construct query'
log.error(err)
return err
@ -109,7 +110,7 @@ class Ozon(Source):
# }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra charactes like this
# some book titles have extra characters like this
# TODO: make a tweak
reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
@ -160,7 +161,7 @@ class Ozon(Source):
mi.source_relevance = i
if ensure_metadata_match(mi):
metadata.append(mi)
# log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
else:
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
return metadata
@ -285,12 +286,12 @@ class Ozon(Source):
url = self.get_book_url(metadata.get_identifiers())[2]
raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw)
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
# series
# series Серия/Серии
xpt = xpt_prod_det_at % u'Сери'
# % u'Серия:'
series = doc.xpath(xpt)
@ -300,7 +301,7 @@ class Ozon(Source):
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
isbn_str = doc.xpath(xpt)
if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
if all_isbns:
metadata.all_isbns = all_isbns
metadata.isbn = all_isbns[0]
@ -333,10 +334,10 @@ class Ozon(Source):
xpt = u'//table[@id="detail_description"]//tr/td'
comment_elem = doc.xpath(xpt)
if comment_elem:
comments = unicode(etree.tostring(comment_elem[0]))
comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
if comments:
# cleanup root tag, TODO: remove tags like object/embed
comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments
else:
@ -345,8 +346,16 @@ class Ozon(Source):
log.debug('No book description found in HTML')
# }}}
def _quoteString(str): # {{{
return '"' + str + '"' if str and str.find(' ') != -1 else str
def _quoteString(strToQuote): # {{{
return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
# }}}
def _verifyISBNIntegrity(log, isbn): # {{{
    """Report whether *isbn* passes ``check_isbn``, logging an error if not.

    :param log: logger object; its ``error`` method is called on failure
    :param isbn: candidate ISBN string
    :return: ``True`` when ``check_isbn(isbn)`` yields a truthy result,
        ``False`` otherwise
    """
    # Online ISBN-Check http://www.isbn-check.de/
    res = check_isbn(isbn)
    # Use one truthiness test for both the log message and the return value
    # so a failure that is logged is never reported to the caller as valid.
    is_valid = bool(res)
    if not is_valid:
        log.error(u'ISBN integrity check failed for "%s"'%isbn)
    return is_valid
# }}}
# TODO: make customizable
@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
return res
# }}}
def toPubdate(log, yearAsString):
def toPubdate(log, yearAsString): # {{{
res = None
if yearAsString:
try:
@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
except:
log.error('cannot parse to date %s'%yearAsString)
return res
# }}}
def _listToUnicodePrintStr(lst): # {{{
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
# }}}
if __name__ == '__main__': # tests {{{
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py

View File

@ -16,6 +16,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
from calibre.library.comments import comments_to_html
from calibre.utils.date import is_date_undefined
from calibre.ebooks.chardet import strip_encoding_declarations
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
@ -175,15 +176,20 @@ def render_jacket(mi, output_profile,
try:
display_name, val = mi.format_field_extended(key)[:2]
key = key.replace('#', '_')
args[key] = val
args[key+'_label'] = display_name
args[key] = escape(val)
args[key+'_label'] = escape(display_name)
except:
pass
# Used in the comment describing use of custom columns in templates
args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
args['_genre'] = args.get('_genre', '{_genre}')
generated_html = P('jacket/template.xhtml',
data=True).decode('utf-8').format(**args)
# Post-process the generated html to strip out empty header items
soup = BeautifulSoup(generated_html)
if not series:
series_tag = soup.find(attrs={'class':'cbj_series'})
@ -206,7 +212,8 @@ def render_jacket(mi, output_profile,
if hr_tag is not None:
hr_tag.extract()
return soup.renderContents(None)
return strip_encoding_declarations(
soup.renderContents('utf-8').decode('utf-8'))
from calibre.ebooks.oeb.base import RECOVER_PARSER

View File

@ -372,13 +372,13 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
self.apply_pubdate.setChecked(True)
def clear_pubdate(self, *args):
self.pubdate.setMinimumDateTime(UNDEFINED_QDATETIME)
self.pubdate.setDateTime(UNDEFINED_QDATETIME)
def do_apply_adddate(self, *args):
self.apply_adddate.setChecked(True)
def clear_adddate(self, *args):
self.adddate.setMinimumDateTime(UNDEFINED_QDATETIME)
self.adddate.setDateTime(UNDEFINED_QDATETIME)
def button_clicked(self, which):
if which == self.button_box.button(QDialogButtonBox.Apply):

View File

@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
result = False
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
raw = xml_to_unicode(f.read(), verbose=True)[0]
doc = html.fromstring(raw)
# example where we are going to find formats
# <div class="l">
@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
# <div class="l">
# <p>.epub, .fb2.zip, .pdf</p>
# </div>
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
formats = doc.xpath(xpt)
if formats:
result = True

View File

@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
msgid "Configure download metadata"
msgstr ""
msgstr "Настроить загрузку метаданных"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
msgid "Change how calibre downloads metadata"
@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
msgid "&Comments"
msgstr "Комментарии"
msgstr "&Комментарии"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
msgid "Basic metadata"
@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has cover"
msgstr "Есть обложка"
msgstr "Обложка"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
msgid "Has summary"
msgstr ""
msgstr "Аннотация"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
msgid ""
@ -12619,7 +12619,7 @@ msgstr ""
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
msgid "See at"
msgstr ""
msgstr "Посмотреть на"
#: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
msgid "calibre is downloading metadata from: "

View File

@ -291,6 +291,11 @@ def clean_date_for_sort(dt, format):
if not isinstance(dt, datetime):
dt = datetime.combine(dt, time())
if hasattr(dt, 'tzinfo'):
if dt.tzinfo is None:
dt = dt.replace(tzinfo=_local_tz)
dt = as_local_time(dt)
if format == 'iso':
format = 'yyMdhms'