Merge from trunk

Charles Haley 2011-11-20 12:10:07 +01:00
commit 5b9ec88d3a
11 changed files with 228 additions and 112 deletions


@@ -1,35 +1,43 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__author__ = 'Gerardo Diez'
-__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
-description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
-__docformat__ = 'restructuredtext en'
+__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
+__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
+__version__ = 'v1.01'
+__date__ = '13, November 2011'
 '''
-expansion.es
+http://www.expansion.com/
 '''
+
+import time
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-class Publico(BasicNewsRecipe):
-    title = u'Expansion.com'
-    __author__ = 'Gerardo Diez'
-    publisher = u'Unidad Editorial Información Económica, S.L.'
-    category = 'finances, catalunya'
-    oldest_article = 1
+
+class expansion_spanish(BasicNewsRecipe):
+    __author__ = 'Gerardo Diez & desUBIKado'
+    description = 'Financial news from Spain'
+    title = u'Expansion'
+    publisher = u'Unidad Editorial Internet, S.L.'
+    category = 'news, finances, Spain'
+    oldest_article = 2
+    simultaneous_downloads = 10
     max_articles_per_feed = 100
-    simultaneous_downloads = 10
-    cover_url = u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
-    timefmt = '[%A, %d %B, %Y]'
-    encoding = 'latin'
+    timefmt = '[%a, %d %b, %Y]'
+    encoding = 'iso-8859-15'
     language = 'es'
-    remove_javascript = True
-    no_stylesheets = True
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    remove_empty_feeds = True
     keep_only_tags = dict(name='div', attrs={'class':['noticia primer_elemento']})
     remove_tags = [
-        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
-        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
+        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
+        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
         dict(name='span', attrs={'class':['comentarios']}),
         dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
-        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
+        dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
     ]
     feeds = [
         (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
@@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
         (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
         (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
         (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
         (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
         (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
-        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
+        (u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
         (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
-        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
+        (u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
         (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
         (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
         (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
         (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
-        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
-        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
+        (u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
+        (u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
         (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
-        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
-        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
+        (u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
+        (u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
         (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
         (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
         (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
         (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
-        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
-        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
+        (u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
+        (u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
         (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
-        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
+        (u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
         (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
         (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
-        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
+        (u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
         (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
-        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
+        (u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
         (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
         (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
-        (u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
-        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
+        (u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
+        (u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
     ]
+
+    # Fetch the cover image
+    def get_cover_url(self):
+        cover = None
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        # e.g. http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg
+        cover = 'http://img5.kiosko.net/' + year + '/' + month + '/' + day + '/es/expansion.750.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nPortada no disponible")
+            cover = 'http://www.aproahp.org/enlaces/images/diario_expansion.gif'
+        return cover
+
+    # To keep the ad interstitial from firing when an article is fetched, and to
+    # always get the real page, pass the current Unix ("epoch") time in the "t"
+    # parameter, so the site believes the ad has just been shown
+    def print_version(self, url):
+        st = time.time()
+        segundos = str(int(st))
+        parametros = '.html?t=' + segundos
+        return url.replace('.html', parametros)
+
+    _processed_links = []
+
+    def get_article_url(self, article):
+        # Recover the original article URL from the "feedsportal" one
+        link = article.get('link', None)
+        if link is None:
+            return article
+        if link.split('/')[-1] == "story01.htm":
+            link = link.split('/')[-2]
+            a = ['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
+            b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
+            for i in range(0, len(a)):
+                link = link.replace(a[i], b[i])
+            link = "http://" + link
+            # Skip articles duplicated in other feeds
+            if not (link in self._processed_links):
+                self._processed_links.append(link)
+            else:
+                link = None
+        return link
+
+    # A little CSS to improve the presentation of the articles
+    extra_css = '''
+        .entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
+        .fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+    '''
+
+    # Show the poster image of embedded videos
+    preprocess_regexps = [
+        (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
+        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
+        (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
+    ]
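
A note on the get_article_url() logic above: feedsportal hides the real article URL inside the link path using two-character escape codes, and the recipe simply reverses them. A standalone sketch (not part of this commit; decode_feedsportal and the sample link are made up for illustration) of the same substitution table at work:

    def decode_feedsportal(link):
        # second-to-last path segment carries the encoded URL
        link = link.split('/')[-2]
        # feedsportal escape codes -> the characters they stand for
        a = ['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
        b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
        for i in range(len(a)):
            link = link.replace(a[i], b[i])
        return "http://" + link

    # hypothetical encoded link, for illustration only:
    print(decode_feedsportal('http://rss.feedsportal.com/c/0/f/0/0L0Sexpansion0N0C20A110C110Cnoticia0Bhtml/story01.htm'))
    # -> http://www.expansion.com/2011/11/noticia.html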


@@ -11,17 +11,16 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Salon_com(BasicNewsRecipe):
     title = 'Salon.com'
-    __author__ = 'cix3'
+    __author__ = 'Kovid Goyal'
     description = 'Salon.com - Breaking news, opinion, politics, entertainment, sports and culture.'
     timefmt = ' [%b %d, %Y]'
     language = 'en'
     oldest_article = 7
     max_articles_per_feed = 100
-    remove_tags = [dict(name='div', attrs={'class':['ad_content', 'clearfix']}), dict(name='hr'), dict(name='img')]
-    remove_empty_feeds = True
-    remove_tags_before = dict(name='h2')
+    auto_cleanup = True
+    auto_cleanup_keep = '//div[@class="art"]'

     feeds = [
         ('News & Politics', 'http://feeds.salon.com/salon/news'),
@@ -40,5 +39,5 @@ class Salon_com(BasicNewsRecipe):
     ]

     def print_version(self, url):
-        return url.replace('/index.html', '/print.html')
+        return url + '/print/'


@@ -38,9 +38,12 @@
 <hr class="cbj_kindle_banner_hr" />
 <!--
 In addition you can add code to show the values of custom columns here.
-The value is available as _column_name and the title as _column_name_label.
-For example, if you have a custom column with label #genre, you can add it to
-this template with _genre_label and _genre. Note that the # is replaced by an underscore.
+The value is available as _column_name and the title as
+_column_name_label. For example, if you have a custom column with
+label #genre, you can add it to this template with _genre_label and
+_genre. Note that the # is replaced by an underscore. For example
+<div><b>{_genre_label}:</b> {_genre}</div>
 -->
 <div class="cbj_comments">{comments}</div>


@@ -217,6 +217,7 @@ class ITUNES(DriverBase):
     # 0x1297  iPhone 4
     # 0x129a  iPad
     # 0x129f  iPad2 (WiFi)
+    # 0x12a0  iPhone 4S
     # 0x12a2  iPad2 (GSM)
     # 0x12a3  iPad2 (CDMA)
     VENDOR_ID = [0x05ac]
@@ -1305,6 +1306,8 @@ class ITUNES(DriverBase):
         if DEBUG:
             self.log.info(" ITUNES._add_new_copy()")

+        self._update_epub_metadata(fpath, metadata)
+
         db_added = None
         lb_added = None
@@ -1409,10 +1412,16 @@ class ITUNES(DriverBase):
             tmp_cover.write(cover_data)

             if lb_added:
-                if lb_added.Artwork.Count:
-                    lb_added.Artwork.Item(1).SetArtworkFromFile(tc)
-                else:
-                    lb_added.AddArtworkFromFile(tc)
+                try:
+                    if lb_added.Artwork.Count:
+                        lb_added.Artwork.Item(1).SetArtworkFromFile(tc)
+                    else:
+                        lb_added.AddArtworkFromFile(tc)
+                except:
+                    if DEBUG:
+                        self.log.warning(" iTunes automation interface reported an error"
+                            " when adding artwork to '%s' in the iTunes Library" % metadata.title)
+                    pass

             if db_added:
                 if db_added.Artwork.Count:
@@ -2663,6 +2672,7 @@ class ITUNES(DriverBase):
                 metadata.timestamp = now()
                 if DEBUG:
                     self.log.info("   add timestamp: %s" % metadata.timestamp)
+
             else:
                 metadata.timestamp = now()
                 if DEBUG:
@@ -2699,7 +2709,7 @@ class ITUNES(DriverBase):
             if iswindows and metadata.series:
                 metadata.tags = None
-            set_metadata(zfo, metadata, update_timestamp=True)
+            set_metadata(zfo, metadata, apply_null=True, update_timestamp=True)

     def _update_device(self, msg='', wait=True):
         '''
@@ -2771,6 +2781,8 @@ class ITUNES(DriverBase):
                 lb_added.sort_name.set(metadata_x.title_sort)

             if db_added:
+                self.log.warning(" waiting for db_added to become writeable ")
+                time.sleep(1.0)
                 db_added.name.set(metadata_x.title)
                 db_added.album.set(metadata_x.title)
                 db_added.artist.set(authors_to_string(metadata_x.authors))


@@ -33,7 +33,7 @@ class IREXDR1000(USBMS):
     MAIN_MEMORY_VOLUME_LABEL = 'IRex Digital Reader 1000 Main Memory'

-    EBOOK_DIR_MAIN = 'ebooks'
+    EBOOK_DIR_MAIN = ''
     DELETE_EXTS = ['.mbp']
     SUPPORTS_SUB_DIRS = True
@@ -44,7 +44,7 @@ class IREXDR800(IREXDR1000):
     WINDOWS_MAIN_MEM = 'DR800'
     FORMATS = ['epub', 'pdb', 'html', 'pdf', 'txt']

-    EBOOK_DIR_MAIN = 'Books'
+    EBOOK_DIR_MAIN = ''
     DELETE_EXTS = []
     SUPPORTS_SUB_DIRS = True


@@ -710,7 +710,8 @@ class Metadata(object):
         fmt('Title sort', self.title_sort)
         if self.authors:
             fmt('Author(s)', authors_to_string(self.authors) + \
-                ((' [' + self.author_sort + ']') if self.author_sort else ''))
+                ((' [' + self.author_sort + ']')
+                    if self.author_sort and self.author_sort != _('Unknown') else ''))
         if self.publisher:
             fmt('Publisher', self.publisher)
         if getattr(self, 'book_producer', False):


@@ -55,6 +55,7 @@ class Ozon(Source):
         # div_book -> search only books, ebooks and audio books
         search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='

+        # for ozon.ru search we have to format ISBN with '-'
         isbn = _format_isbn(log, identifiers.get('isbn', None))
         # TODO: format isbn!
         qItems = set([isbn, title])
@@ -64,7 +65,7 @@ class Ozon(Source):
         qItems.discard('')
         qItems = map(_quoteString, qItems)

-        q = ' '.join(qItems).strip()
+        q = u' '.join(qItems).strip()
         log.info(u'search string: ' + q)

         if isinstance(q, unicode):
@@ -78,13 +79,13 @@ class Ozon(Source):
         return search_url
     # }}}

-    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
-                 identifiers={}, timeout=30):
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+                 identifiers={}, timeout=30): # {{{
         if not self.is_configured():
             return
         query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
         if not query:
-            err = 'Insufficient metadata to construct query'
+            err = u'Insufficient metadata to construct query'
             log.error(err)
             return err
@@ -109,15 +110,15 @@ class Ozon(Source):
     # }}}

     def get_metadata(self, log, entries, title, authors, identifiers): # {{{
-        # some book titles have extra charactes like this
+        # some book titles have extra characters like this
         # TODO: make a twick
         reRemoveFromTitle = None
         #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

         title = unicode(title).upper() if title else ''
         if reRemoveFromTitle:
             title = reRemoveFromTitle.sub('', title)
         authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.upper, map(unicode, authors))) if authors else None
         ozon_id = identifiers.get('ozon', None)
@@ -160,7 +161,7 @@ class Ozon(Source):
                 mi.source_relevance = i
                 if ensure_metadata_match(mi):
                     metadata.append(mi)
-                    # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
+                    #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
                 else:
                     log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
         return metadata
@@ -285,12 +286,12 @@ class Ozon(Source):
             url = self.get_book_url(metadata.get_identifiers())[2]

             raw = self.browser.open_novisit(url, timeout=timeout).read()
-            doc = html.fromstring(raw)
+            doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])

             xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
             xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'

-            # series
+            # series Серия/Серии
             xpt = xpt_prod_det_at % u'Сери'
             # % u'Серия:'
             series = doc.xpath(xpt)
@@ -300,7 +301,7 @@ class Ozon(Source):
             xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
             isbn_str = doc.xpath(xpt)
             if isbn_str:
-                all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
+                all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
                 if all_isbns:
                     metadata.all_isbns = all_isbns
                     metadata.isbn = all_isbns[0]
@@ -319,7 +320,7 @@ class Ozon(Source):
                 displ_lang = lng_splt[0].strip()
                 metadata.language = _translageLanguageToCode(displ_lang)
                 #log.debug(u'language: %s'%displ_lang)

         # can be set before from xml search responce
         if not metadata.pubdate:
             xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
@@ -333,10 +334,10 @@ class Ozon(Source):
             xpt = u'//table[@id="detail_description"]//tr/td'
             comment_elem = doc.xpath(xpt)
             if comment_elem:
-                comments = unicode(etree.tostring(comment_elem[0]))
+                comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
                 if comments:
                     # cleanup root tag, TODO: remove tags like object/embeded
-                    comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
+                    comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
                     if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                         metadata.comments = comments
                 else:
@@ -345,8 +346,16 @@ class Ozon(Source):
                     log.debug('No book description found in HTML')
     # }}}

-def _quoteString(str): # {{{
-    return '"' + str + '"' if str and str.find(' ') != -1 else str
+def _quoteString(strToQuote): # {{{
+    return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
+# }}}
+
+def _verifyISBNIntegrity(log, isbn): # {{{
+    # Online ISBN-Check http://www.isbn-check.de/
+    res = check_isbn(isbn)
+    if not res:
+        log.error(u'ISBN integrity check failed for "%s"'%isbn)
+    return res is not None
 # }}}

 # TODO: make customizable
@@ -425,20 +434,20 @@ def _translageLanguageToCode(displayLang): # {{{
 # [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
 def _normalizeAuthorNameWithInitials(name): # {{{
     res = name
     if name:
         re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
         re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
         matcher = re.match(re1, unicode(name), re.UNICODE)
         if not matcher:
             matcher = re.match(re2, unicode(name), re.UNICODE)
         if matcher:
             d = matcher.groupdict()
             res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
     return res
 # }}}

-def toPubdate(log, yearAsString):
+def toPubdate(log, yearAsString): # {{{
     res = None
     if yearAsString:
         try:
@@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
         except:
             log.error('cannot parse to date %s'%yearAsString)
     return res
+# }}}
+
+def _listToUnicodePrintStr(lst): # {{{
+    return u'[' + u', '.join(unicode(x) for x in lst) + u']'
+# }}}

 if __name__ == '__main__': # tests {{{
 # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
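
For context on _normalizeAuthorNameWithInitials() above: it folds the two common orderings on ozon.ru pages ("surname initials" and "initials surname") into one spelling, so author matching works regardless of which form the site uses. A quick check, assuming the plugin module is importable (e.g. inside a calibre-debug session):

    from calibre.ebooks.metadata.sources.ozon import _normalizeAuthorNameWithInitials

    for name in (u'Колесников В.П.', u'В.П. Колесников'):
        print(_normalizeAuthorNameWithInitials(name))
    # both print: В. П. Колесников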


@@ -302,7 +302,19 @@ class MobiWriter(object):
     def generate_record0(self): # MOBI header {{{
         metadata = self.oeb.metadata

-        exth = self.build_exth()
+        bt = 0x002
+        if self.primary_index_record_idx is not None:
+            if False and self.indexer.is_flat_periodical:
+                # Disabled as setting this to 0x102 causes the Kindle to not
+                # auto archive the issues
+                bt = 0x102
+            elif self.indexer.is_periodical:
+                # If you change this, remember to change the cdetype in the EXTH
+                # header as well
+                bt = {'newspaper':0x101}.get(self.publication_type, 0x103)
+
+        exth = self.build_exth(bt)
         first_image_record = None
         if self.image_records:
             first_image_record = len(self.records)
@@ -351,17 +363,6 @@ class MobiWriter(object):
         # 0x10 - 0x13 : UID
         # 0x14 - 0x17 : Generator version

-        bt = 0x002
-        if self.primary_index_record_idx is not None:
-            if False and self.indexer.is_flat_periodical:
-                # Disabled as setting this to 0x102 causes the Kindle to not
-                # auto archive the issues
-                bt = 0x102
-            elif self.indexer.is_periodical:
-                # If you change this, remember to change the cdetype in the EXTH
-                # header as well
-                bt = {'newspaper':0x101}.get(self.publication_type, 0x103)
-
         record0.write(pack(b'>IIIII',
             0xe8, bt, 65001, uid, 6))
@@ -479,7 +480,7 @@ class MobiWriter(object):
         self.records[0] = align_block(record0)
     # }}}

-    def build_exth(self): # EXTH Header {{{
+    def build_exth(self, mobi_doctype): # EXTH Header {{{
         oeb = self.oeb
         exth = StringIO()
         nrecs = 0
@@ -535,16 +536,17 @@ class MobiWriter(object):
             nrecs += 1

         # Write cdetype
-        if not self.is_periodical and not self.opts.share_not_sync:
-            exth.write(pack(b'>II', 501, 12))
-            exth.write(b'EBOK')
-            nrecs += 1
+        if not self.is_periodical:
+            if not self.opts.share_not_sync:
+                exth.write(pack(b'>II', 501, 12))
+                exth.write(b'EBOK')
+                nrecs += 1
         else:
-            # Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype
-            # of 0x103 but the old writer didn't write them, and I dont know
-            # what it should be for type 0x102 (b'BLOG'?) so write nothing
-            # instead
-            pass
+            ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None)
+            if ids:
+                exth.write(pack(b'>II', 501, 12))
+                exth.write(ids)
+                nrecs += 1

         # Add a publication date entry
         if oeb.metadata['date']:
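
For reference, the cdetype written by build_exth() above is an ordinary EXTH record: big-endian record type, total record length (the 8 header bytes plus the payload), then the payload, which is why the length field is 12 for the 4-byte 'EBOK'/'NWPR'/'MAGZ' strings. A minimal sketch (exth_record is a made-up helper, not calibre code):

    from struct import pack

    def exth_record(rec_type, payload):
        # type (4 bytes) + length (4 bytes, includes this header) + payload
        return pack(b'>II', rec_type, 8 + len(payload)) + payload

    assert exth_record(501, b'NWPR') == pack(b'>II', 501, 12) + b'NWPR'  # newspaper, doctype 0x101
    assert exth_record(501, b'MAGZ') == pack(b'>II', 501, 12) + b'MAGZ'  # magazine, doctype 0x103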


@@ -16,6 +16,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
 from calibre.library.comments import comments_to_html
 from calibre.utils.date import is_date_undefined
+from calibre.ebooks.chardet import strip_encoding_declarations

 JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
@@ -175,15 +176,20 @@ def render_jacket(mi, output_profile,
             try:
                 display_name, val = mi.format_field_extended(key)[:2]
                 key = key.replace('#', '_')
-                args[key] = val
-                args[key+'_label'] = display_name
+                args[key] = escape(val)
+                args[key+'_label'] = escape(display_name)
             except:
                 pass

+    # Used in the comment describing use of custom columns in templates
+    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
+    args['_genre'] = args.get('_genre', '{_genre}')
+
     generated_html = P('jacket/template.xhtml',
             data=True).decode('utf-8').format(**args)

     # Post-process the generated html to strip out empty header items
     soup = BeautifulSoup(generated_html)

     if not series:
         series_tag = soup.find(attrs={'class':'cbj_series'})
@@ -206,7 +212,8 @@ def render_jacket(mi, output_profile,
         if hr_tag is not None:
             hr_tag.extract()

-    return soup.renderContents(None)
+    return strip_encoding_declarations(
+            soup.renderContents('utf-8').decode('utf-8'))

 from calibre.ebooks.oeb.base import RECOVER_PARSER
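
Why the two args defaults above are needed: the jacket template's HTML comment now contains literal {_genre_label}/{_genre} markers (see the template change earlier in this commit), and str.format() raises KeyError for any marker it cannot resolve, so seeding args with the marker text echoes the markers through unchanged when no #genre column exists. A small sketch of the mechanism:

    template = u'<div><b>{_genre_label}:</b> {_genre}</div>'
    args = {}
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')
    print(template.format(**args))
    # -> <div><b>{_genre_label}:</b> {_genre}</div>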


@@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
         result = False
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
+            raw = xml_to_unicode(f.read(), verbose=True)[0]
+            doc = html.fromstring(raw)

             # example where we are going to find formats
             # <div class="l">
@@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
             # <div class="l">
             # <p>.epub, .fb2.zip, .pdf</p>
             # </div>
-            xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
+            xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
             formats = doc.xpath(xpt)
             if formats:
                 result = True


@@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
 msgid "Configure download metadata"
-msgstr ""
+msgstr "Настроить загрузку метаданных"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
 msgid "Change how calibre downloads metadata"
@@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
 msgid "&Comments"
-msgstr "Комментарии"
+msgstr "&Комментарии"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
 msgid "Basic metadata"
@@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
 msgid "Has cover"
-msgstr "Есть обложка"
+msgstr "Обложка"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
 msgid "Has summary"
-msgstr ""
+msgstr "Аннотация"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
 msgid ""
@@ -12619,7 +12619,7 @@ msgstr ""
 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
 msgid "See at"
-msgstr ""
+msgstr "Посмотреть на"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
 msgid "calibre is downloading metadata from: "