Merge from trunk
This commit is contained in: commit fdb842b036
BIN  resources/images/news/latimes.png  (new file; binary, 358 B, not shown)
@@ -1,25 +1,25 @@
 # -*- coding: utf-8
 __license__ = 'GPL v3'
 __author__ = 'Luis Hernandez'
 __copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
 description = 'Periódico gratuito en español - v0.8 - 27 Jan 2011'
 __version__ = 'v0.85'
 __date__ = '31 January 2011'

 '''
 www.20minutos.es
 '''

 import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class AdvancedUserRecipe1294946868(BasicNewsRecipe):

-    title = u'20 Minutos'
+    title = u'20 Minutos new'
     publisher = u'Grupo 20 Minutos'

-    __author__ = 'Luis Hernández'
-    description = 'Periódico gratuito en español'
+    __author__ = 'Luis Hernandez'
+    description = 'Free Spanish newspaper'
     cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'

-    oldest_article = 5
+    oldest_article = 2
     max_articles_per_feed = 100

     remove_javascript = True
@@ -29,6 +29,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     encoding = 'ISO-8859-1'
     language = 'es'
     timefmt = '[%a, %d %b, %Y]'
+    remove_empty_feeds = True

     keep_only_tags = [
         dict(name='div', attrs={'id':['content','vinetas',]})
@@ -43,13 +44,21 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     remove_tags = [
         dict(name='ol', attrs={'class':['navigation',]})
        ,dict(name='span', attrs={'class':['action']})
-       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
+       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
        ,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
        ,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
        ,dict(name='ul', attrs={'id':['site-links']})
        ,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
     ]

     extra_css = """
         p{text-align: justify; font-size: 100%}
         body{ text-align: left; font-size:100% }
         h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
     """

     preprocess_regexps = [(re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]

     feeds = [
         (u'Portada' , u'http://www.20minutos.es/rss/')
        ,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
71  resources/recipes/cinco_dias.recipe  (new file)
@@ -0,0 +1,71 @@
+__license__ = 'GPL v3'
+__author__ = 'Luis Hernandez'
+__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
+__version__ = 'v1.2'
+__date__ = '31 January 2011'
+
+'''
+http://www.cincodias.com/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+
+    title = u'Cinco Dias'
+    publisher = u'Grupo Prisa'
+
+    __author__ = 'Luis Hernandez'
+    description = 'Spanish website about money and business, free edition'
+
+    cover_url = 'http://www.prisa.com/images/logos/logo_cinco_dias.gif'
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+
+    language = 'es'
+    remove_empty_feeds = True
+    encoding = 'ISO-8859-1'
+    timefmt = '[%a, %d %b, %Y]'
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class':['cab_articulo cab_noticia','pos_3','txt_noticia','mod_despiece']})
+       ,dict(name='p', attrs={'class':['cintillo']})
+    ]
+
+    remove_tags_before = dict(name='div' , attrs={'class':['publi_h']})
+    remove_tags_after  = dict(name='div' , attrs={'class':['tab_util util_estadisticas']})
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['util-1','util-2','util-3','inner estirar','inner1','inner2','inner3','cont','tab_util util_estadisticas','tab_util util_enviar','mod_list_inf','mod_similares','mod_divisas','mod_sectores','mod_termometro','mod post','mod_img','mod_txt','nivel estirar','barra estirar','info_brujula btnBrujula','utilidad_brujula estirar']})
+       ,dict(name='li', attrs={'class':['lnk-fcbook','lnk-retweet','lnk-meneame','desplegable','comentarios','list-options','estirar']})
+       ,dict(name='ul', attrs={'class':['lista-izquierda','list-options','estirar']})
+       ,dict(name='p', attrs={'class':['autor']})
+    ]
+
+    extra_css = """
+        p{text-align: justify; font-size: 100%}
+        body{ text-align: left; font-size:100% }
+        h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
+        h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
+    """
+
+    feeds = [
+        (u'Ultima Hora' , u'http://www.cincodias.com/rss/feed.html?feedId=17029')
+       ,(u'Empresas' , u'http://www.cincodias.com/rss/feed.html?feedId=19')
+       ,(u'Mercados' , u'http://www.cincodias.com/rss/feed.html?feedId=20')
+       ,(u'Economia' , u'http://www.cincodias.com/rss/feed.html?feedId=21')
+       ,(u'Tecnorama' , u'http://www.cincodias.com/rss/feed.html?feedId=17230')
+       ,(u'Tecnologia' , u'http://www.cincodias.com/rss/feed.html?feedId=17106')
+       ,(u'Finanzas Personales' , u'http://www.cincodias.com/rss/feed.html?feedId=22')
+       ,(u'Fiscalidad' , u'http://www.cincodias.com/rss/feed.html?feedId=17107')
+       ,(u'Vivienda' , u'http://www.cincodias.com/rss/feed.html?feedId=17108')
+       ,(u'Tendencias' , u'http://www.cincodias.com/rss/feed.html?feedId=17109')
+       ,(u'Empleo' , u'http://www.cincodias.com/rss/feed.html?feedId=17110')
+       ,(u'IBEX 35' , u'http://www.cincodias.com/rss/feed.html?feedId=17125')
+       ,(u'Sectores' , u'http://www.cincodias.com/rss/feed.html?feedId=17126')
+       ,(u'Opinion' , u'http://www.cincodias.com/rss/feed.html?feedId=17105')
+    ]
@@ -1,73 +1,92 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
-latimes.com
+www.latimes.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

 class LATimes(BasicNewsRecipe):
-    title = u'The Los Angeles Times'
-    __author__ = u'Darko Miletic and Sujata Raman'
-    description = u'News from Los Angeles'
-    oldest_article = 7
-    max_articles_per_feed = 100
-    language = 'en'
+    title = 'Los Angeles Times'
+    __author__ = 'Darko Miletic'
+    description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'
+    publisher = 'Tribune Company'
+    category = 'news, politics, USA, Los Angeles, world'
+    oldest_article = 2
+    max_articles_per_feed = 200
     no_stylesheets = True
-    encoding = 'utf8'
+    encoding = 'utf-8'
     use_embedded_content = False
-    lang = 'en-US'
+    language = 'en'
+    remove_empty_feeds = True
+    publication_type = 'newspaper'
+    masthead_url = 'http://www.latimes.com/images/logo.png'
+    cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
+    extra_css = """
+        body{font-family: Georgia,"Times New Roman",Times,serif }
+        img{margin-bottom: 0.4em; margin-top: 0.8em; display:block}
+        h2{font-size: 1.1em}
+        .deckhead{font-size: small; text-transform: uppercase}
+        .small{color: gray; font-size: small}
+        .date,.time,.copyright{font-size: x-small; color:gray; font-style:italic;}
+    """

     conversion_options = {
         'comment'            : description
-      , 'language'           : lang
+      , 'tags'               : category
+      , 'publisher'          : publisher
+      , 'language'           : language
+      , 'linearize_tables'   : 'Yes'
     }

-    extra_css = '''
-        h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; }
-        h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
-        .story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
-        .entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
-        .entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
-        .credit{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
-        .small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
-        .byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
-        .date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
-        .time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
-        .copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; }
-        .subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
-    '''
-
-#    recursions = 1
-#    match_regexps = [r'http://www.latimes.com/.*page=[2-9]']
-
-    keep_only_tags = [dict(name='div', attrs={'class':["story" ,"entry"] })]
-
-    remove_tags = [ dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
-                    dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}),
-                    dict(name='p', attrs={'class':["entry-footer",]}),
-                    dict(name='ul', attrs={'class':"article-nav clearfix"}),
-                    dict(name=['iframe'])
-                  ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'story'})
+       ,dict(attrs={'class':['entry-header','time','entry-content']})
+    ]

-    feeds = [(u'News', u'http://feeds.latimes.com/latimes/news')
-             ,(u'Local','http://feeds.latimes.com/latimes/news/local')
-             ,(u'MostEmailed','http://feeds.latimes.com/MostEmailed')
-             ,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
-             ,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/')
-             ,('National','http://feeds.latimes.com/latimes/news/nationworld/nation')
-             ,('Politics','http://feeds.latimes.com/latimes/news/politics/')
-             ,('Business','http://feeds.latimes.com/latimes/business')
-             ,('Sports','http://feeds.latimes.com/latimes/sports/')
-             ,('Entertainment','http://feeds.latimes.com/latimes/entertainment/')
-            ]
+    remove_tags_after = dict(name='p', attrs={'class':'copyright'})
+    remove_tags = [
+        dict(name=['meta','link','iframe','object','embed'])
+       ,dict(attrs={'class':['toolSet','articlerail','googleAd','entry-footer-left','entry-footer-right','entry-footer-social','google-ad-story-bottom','sphereTools']})
+       ,dict(attrs={'id':['article-promo','googleads','moduleArticleToolsContainer','gallery-subcontent']})
+    ]
+    remove_attributes = ['lang','xmlns:fb','xmlns:og','border','xtags','i','article_body']

+    feeds = [
+        (u'Top News'           , u'http://feeds.latimes.com/latimes/news')
+       ,(u'Local News'         , u'http://feeds.latimes.com/latimes/news/local')
+       ,(u'National'           , u'http://feeds.latimes.com/latimes/news/nationworld/nation')
+       ,(u'National Politics'  , u'http://feeds.latimes.com/latimes/news/politics/')
+       ,(u'Business'           , u'http://feeds.latimes.com/latimes/business')
+       ,(u'Education'          , u'http://feeds.latimes.com/latimes/news/education')
+       ,(u'Environment'        , u'http://feeds.latimes.com/latimes/news/science/environment')
+       ,(u'Religion'           , u'http://feeds.latimes.com/latimes/features/religion')
+       ,(u'Science'            , u'http://feeds.latimes.com/latimes/news/science')
+       ,(u'Technology'         , u'http://feeds.latimes.com/latimes/technology')
+       ,(u'Africa'             , u'http://feeds.latimes.com/latimes/africa')
+       ,(u'Asia'               , u'http://feeds.latimes.com/latimes/asia')
+       ,(u'Europe'             , u'http://feeds.latimes.com/latimes/europe')
+       ,(u'Latin America'      , u'http://feeds.latimes.com/latimes/latinamerica')
+       ,(u'Middle East'        , u'http://feeds.latimes.com/latimes/middleeast')
+       ,(u'Arts&Culture'       , u'http://feeds.feedburner.com/latimes/entertainment/news/arts')
+       ,(u'Entertainment News' , u'http://feeds.feedburner.com/latimes/entertainment/news/')
+       ,(u'Movie News'         , u'http://feeds.feedburner.com/latimes/entertainment/news/movies/')
+       ,(u'Movie Reviews'      , u'http://feeds.feedburner.com/movies/reviews/')
+       ,(u'Music News'         , u'http://feeds.feedburner.com/latimes/entertainment/news/music/')
+       ,(u'Pop Album Reviews'  , u'http://feeds.feedburner.com/latimes/pop-album-reviews')
+       ,(u'Restaurant Reviews' , u'http://feeds.feedburner.com/latimes/restaurant/reviews')
+       ,(u'Theater and Dance'  , u'http://feeds.feedburner.com/latimes/theaterdance')
+       ,(u'Autos'              , u'http://feeds.latimes.com/latimes/classified/automotive/highway1/')
+       ,(u'Books'              , u'http://feeds.latimes.com/features/books')
+       ,(u'Food'               , u'http://feeds.latimes.com/latimes/features/food/')
+       ,(u'Health'             , u'http://feeds.latimes.com/latimes/features/health/')
+       ,(u'Real Estate'        , u'http://feeds.latimes.com/latimes/classified/realestate/')
+       ,(u'Commentary'         , u'http://feeds2.feedburner.com/latimes/news/opinion/commentary/')
+       ,(u'Sports'             , u'http://feeds.latimes.com/latimes/sports/')
+    ]

     def get_article_url(self, article):
-        ans = article.get('feedburner_origlink').rpartition('?')[0]
+        ans = BasicNewsRecipe.get_article_url(self, article).rpartition('?')[0]

         try:
             self.log('Looking for full story link in', ans)
@@ -83,4 +102,22 @@ class LATimes(BasicNewsRecipe):
             pass
         return ans

+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+                txt = item.string
+                item.replaceWith(txt)
+            else:
+                if limg:
+                    item.name = 'div'
+                    item.attrs = []
+                else:
+                    txt = self.tag_to_string(item)
+                    item.replaceWith(txt)
+        return soup
@@ -139,6 +139,13 @@ class CHMReader(CHMFile):
         if self.hhc_path not in files and files:
             self.hhc_path = files[0]

+        if self.hhc_path == '.hhc' and self.hhc_path not in files:
+            from calibre import walk
+            for x in walk(output_dir):
+                if os.path.basename(x).lower() in ('index.htm', 'index.html'):
+                    self.hhc_path = os.path.relpath(x, output_dir)
+                    break
+
     def _reformat(self, data, htmlpath):
         try:
             data = xml_to_unicode(data, strip_encoding_pats=True)[0]
@@ -175,6 +175,19 @@ class EPUBInput(InputFormatPlugin):
             raise ValueError(
                 'EPUB files with DTBook markup are not supported')

+        for x in list(opf.iterspine()):
+            ref = x.get('idref', None)
+            if ref is None:
+                x.getparent().remove(x)
+                continue
+            for y in opf.itermanifest():
+                if y.get('id', None) == ref and y.get('media-type', None) in \
+                        ('application/vnd.adobe-page-template+xml',):
+                    p = x.getparent()
+                    if p is not None:
+                        p.remove(x)
+                    break
+
         with open('content.opf', 'wb') as nopf:
             nopf.write(opf.render())
61  src/calibre/ebooks/metadata/sources/base.py  (new file)
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.customize import Plugin
+
+class Source(Plugin):
+
+    type = _('Metadata source')
+    author = 'Kovid Goyal'
+
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    result_of_identify_is_complete = True
+
+    def get_author_tokens(self, authors):
+        'Take a list of authors and return a list of tokens useful for an '
+        'AND search query'
+        # Leave ' in there for Irish names
+        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+        for au in authors:
+            for tok in au.split():
+                yield pat.sub('', tok)
+
+    def split_jobs(self, jobs, num):
+        'Split a list of jobs into at most num groups, as evenly as possible'
+        groups = [[] for i in range(num)]
+        jobs = list(jobs)
+        while jobs:
+            for gr in groups:
+                try:
+                    job = jobs.pop()
+                except IndexError:
+                    break
+                gr.append(job)
+        return [g for g in groups if g]
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        '''
+        Identify a book by its title/author/isbn/etc.
+
+        :param log: A log object; use it to output debugging information/errors
+        :param result_queue: A result Queue; results should be put into it.
+                             Each result is a Metadata object
+        :param abort: If abort.is_set() returns True, abort further processing
+                      and return as soon as possible
+        :param title: The title of the book, can be None
+        :param authors: A list of authors of the book, can be None
+        :param identifiers: A dictionary of other identifiers, most commonly
+                            {'isbn':'1234...'}
+        :return: None if no errors occurred, otherwise a unicode representation
+                 of the error suitable for showing to the user
+        '''
+        return None
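A minimal sketch, not part of this commit, of how a caller might drive
Source.identify(); it assumes a working calibre environment and uses the
GoogleBooks subclass added below. Passing None as the plugin path and using
calibre's default_log are assumptions made for illustration only:

    from Queue import Queue            # Python 2 stdlib, as used by calibre at the time
    from threading import Event

    from calibre.utils.logging import default_log
    from calibre.ebooks.metadata.sources.google import GoogleBooks

    source = GoogleBooks(None)         # hypothetical instantiation of the plugin
    results, abort = Queue(), Event()
    err = source.identify(default_log, results, abort,
            title='Learning Python', authors=['Mark Lutz'])
    if err is not None:
        print err                      # unicode error message meant for the user
    while not results.empty():
        mi = results.get()             # each result is a Metadata object
        print mi.title, mi.authors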
215  src/calibre/ebooks/metadata/sources/google.py  (new file)
@@ -0,0 +1,215 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from urllib import urlencode
+from functools import partial
+from threading import Thread
+
+from lxml import etree
+
+from calibre.ebooks.metadata.sources import Source
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import parse_date, utcnow
+from calibre import browser, as_unicode
+
+NAMESPACES = {
+    'openSearch' : 'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom'       : 'http://www.w3.org/2005/Atom',
+    'dc'         : 'http://purl.org/dc/terms'
+}
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+
+total_results  = XPath('//openSearch:totalResults')
+start_index    = XPath('//openSearch:startIndex')
+items_per_page = XPath('//openSearch:itemsPerPage')
+entry          = XPath('//atom:entry')
+entry_id       = XPath('descendant::atom:id')
+creator        = XPath('descendant::dc:creator')
+identifier     = XPath('descendant::dc:identifier')
+title          = XPath('descendant::dc:title')
+date           = XPath('descendant::dc:date')
+publisher      = XPath('descendant::dc:publisher')
+subject        = XPath('descendant::dc:subject')
+description    = XPath('descendant::dc:description')
+language       = XPath('descendant::dc:language')
+
+def to_metadata(browser, log, entry_):
+
+    def get_text(extra, x):
+        try:
+            ans = x(extra)
+            if ans:
+                ans = ans[0].text
+                if ans and ans.strip():
+                    return ans.strip()
+        except:
+            log.exception('Programming error:')
+        return None
+
+    id_url = entry_id(entry_)[0].text
+    title_ = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if not authors:
+        authors = [_('Unknown')]
+    if not id_url or not title_:
+        # Silently discard this entry
+        return None
+
+    mi = Metadata(title_, authors)
+    try:
+        raw = browser.open(id_url).read()
+        feed = etree.fromstring(raw)
+        extra = entry(feed)[0]
+    except:
+        log.exception('Failed to get additional details for', mi.title)
+        return mi
+
+    mi.comments = get_text(extra, description)
+    #mi.language = get_text(extra, language)
+    mi.publisher = get_text(extra, publisher)
+
+    # Author sort
+    for x in creator(extra):
+        for key, val in x.attrib.items():
+            if key.endswith('file-as') and val and val.strip():
+                mi.author_sort = val
+                break
+    # ISBN
+    isbns = []
+    for x in identifier(extra):
+        t = str(x.text).strip()
+        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
+            if t[:5].upper() == 'ISBN:':
+                isbns.append(t[5:])
+    if isbns:
+        # Prefer the longest identifier, i.e. ISBN-13 over ISBN-10
+        mi.isbn = sorted(isbns, key=len)[-1]
+
+    # Tags
+    try:
+        btags = [x.text for x in subject(extra) if x.text]
+        tags = []
+        for t in btags:
+            tags.extend([y.strip() for y in t.split('/')])
+        tags = list(sorted(list(set(tags))))
+    except:
+        log.exception('Failed to parse tags:')
+        tags = []
+    if tags:
+        mi.tags = [x.replace(',', ';') for x in tags]
+
+    # pubdate
+    pubdate = get_text(extra, date)
+    if pubdate:
+        try:
+            default = utcnow().replace(day=15)
+            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+        except:
+            log.exception('Failed to parse pubdate')
+
+    return mi
+
+class Worker(Thread):
+
+    def __init__(self, log, entries, abort, result_queue):
+        self.browser, self.log, self.entries = browser(), log, entries
+        self.abort, self.result_queue = abort, result_queue
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        for i in self.entries:
+            try:
+                ans = to_metadata(self.browser, self.log, i)
+                if ans is not None:
+                    self.result_queue.put(ans)
+            except:
+                self.log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if self.abort.is_set():
+                break
+
+class GoogleBooks(Source):
+
+    name = 'Google Books'
+
+    def create_query(self, log, title=None, authors=None, identifiers={},
+            start_index=1):
+        BASE_URL = 'http://books.google.com/books/feeds/volumes?'
+        isbn = identifiers.get('isbn', None)
+        q = ''
+        if isbn is not None:
+            q += 'isbn:'+isbn
+        elif title or authors:
+            def build_term(prefix, parts):
+                return ' '.join('in'+prefix + ':' + x for x in parts)
+            if title is not None:
+                q += build_term('title', title.split())
+            if authors:
+                q += ('+' if q else '')+build_term('author',
+                        self.get_author_tokens(authors))
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        if not q:
+            return None
+        return BASE_URL+urlencode({
+            'q':q,
+            'max-results':20,
+            'start-index':start_index,
+            'min-viewability':'none',
+        })
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        try:
+            raw = browser().open_novisit(query).read()
+        except Exception, e:
+            log.exception('Failed to make identify query: %r'%query)
+            return as_unicode(e)
+
+        try:
+            parser = etree.XMLParser(recover=True, no_network=True)
+            feed = etree.fromstring(raw, parser=parser)
+            entries = entry(feed)
+        except Exception, e:
+            log.exception('Failed to parse identify results')
+            return as_unicode(e)
+
+        groups = self.split_jobs(entries, 5) # At most 5 threads
+        if not groups:
+            return
+        workers = [Worker(log, entries, abort, result_queue) for entries in
+                groups]
+
+        if abort.is_set():
+            return
+
+        for worker in workers: worker.start()
+
+        has_alive_worker = True
+        while has_alive_worker and not abort.is_set():
+            has_alive_worker = False
+            for worker in workers:
+                if worker.is_alive():
+                    has_alive_worker = True
+            time.sleep(0.1)
+
+        return None
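A minimal sketch, not part of this commit, of the Atom feed URL that
create_query() produces for a title/author lookup. The instantiation with
None and the exact parameter order after urlencode() are illustrative:

    from calibre.utils.logging import default_log
    from calibre.ebooks.metadata.sources.google import GoogleBooks

    gb = GoogleBooks(None)
    print gb.create_query(default_log, title='Learning Python',
            authors=['Mark Lutz'])
    # -> http://books.google.com/books/feeds/volumes?q=intitle%3ALearning+
    #    intitle%3APython%2Binauthor%3AMark+inauthor%3ALutz
    #    &max-results=20&start-index=1&min-viewability=none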
@@ -83,6 +83,7 @@ class RTFInput(InputFormatPlugin):
             os.mkdir(debug_dir)
             debug_dir = 'rtfdebug'
             run_lev = 4
+            self.log('Running RTFParser in debug mode')
         except:
             pass
         parser = ParseRtf(
@@ -230,22 +231,6 @@ class RTFInput(InputFormatPlugin):
         with open('styles.css', 'ab') as f:
             f.write(css)

-    # def preprocess(self, fname):
-    #     self.log('\tPreprocessing to convert unicode characters')
-    #     try:
-    #         data = open(fname, 'rb').read()
-    #         from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
-    #         tokenizer = RtfTokenizer(data)
-    #         tokens = RtfTokenParser(tokenizer.tokens)
-    #         data = tokens.toRTF()
-    #         fname = 'preprocessed.rtf'
-    #         with open(fname, 'wb') as f:
-    #             f.write(data)
-    #     except:
-    #         self.log.exception(
-    #             'Failed to preprocess RTF to convert unicode sequences, ignoring...')
-    #     return fname

     def convert_borders(self, doc):
         border_styles = []
         style_map = {}
@@ -280,8 +265,6 @@ class RTFInput(InputFormatPlugin):
         self.opts = options
         self.log = log
         self.log('Converting RTF to XML...')
-        #Name of the preprocesssed RTF file
-        # fname = self.preprocess(stream.name)
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
@@ -335,3 +318,4 @@ class RTFInput(InputFormatPlugin):
         opf.render(open('metadata.opf', 'wb'))
         return os.path.abspath('metadata.opf')
@@ -238,6 +238,8 @@ class ParseRtf:
             bug_handler = RtfInvalidCodeException,
         )
         enc = 'cp' + encode_obj.get_codepage()
+        if enc == 'cp10000':
+            enc = 'mac_roman'
         msg = 'Exception in token processing'
         if check_encoding_obj.check_encoding(self.__file, enc):
             file_name = self.__file if isinstance(self.__file, str) \
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile, re

 from calibre.ebooks.rtf2xml import copy

 class Colors:
     """
     Change lines with color info from color numbers to the actual color names.
@@ -40,8 +42,10 @@ class Colors:
         self.__file = in_file
         self.__copy = copy
         self.__bug_handler = bug_handler
+        self.__line = 0
         self.__write_to = tempfile.mktemp()
         self.__run_level = run_level

     def __initiate_values(self):
         """
         Initiate all values.
@@ -61,6 +65,7 @@ class Colors:
         self.__color_num = 1
+        self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
         # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2

     def __before_color_func(self, line):
         """
         Requires:
@@ -76,6 +81,7 @@ class Colors:
         if self.__token_info == 'mi<mk<clrtbl-beg':
             self.__state = 'in_color_table'
         self.__write_obj.write(line)

     def __default_color_func(self, line):
         """
         Requires:
@@ -87,6 +93,7 @@ class Colors:
         """
         hex_num = line[-3:-1]
         self.__color_string += hex_num

     def __blue_func(self, line):
         """
         Requires:
@@ -109,6 +116,7 @@ class Colors:
         )
         self.__color_num += 1
         self.__color_string = '#'

     def __in_color_func(self, line):
         """
         Requires:
@@ -127,12 +135,13 @@ class Colors:
             self.__state = 'after_color_table'
         else:
             action = self.__state_dict.get(self.__token_info)
-            if action == None:
+            if action is None:
                 sys.stderr.write('in module colors.py\n'
                     'function is self.__in_color_func\n'
                     'no action for %s' % self.__token_info
                 )
             action(line)

     def __after_color_func(self, line):
         """
         Check the line to see if it contains color info. If it does, extract the
@@ -180,6 +189,7 @@ class Colors:
         else:
             self.__write_obj.write(line)
         # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2

     def __sub_from_line_color(self, match_obj):
         num = match_obj.group(1)
         try:
@@ -191,25 +201,27 @@ class Colors:
         else:
             return 'bdr-color_:no-value'
         hex_num = self.__figure_num(num)
-        return_value = 'bdr-color_:%s' % hex_num
-        return return_value
+        return 'bdr-color_:%s' % hex_num

     def __figure_num(self, num):
         if num == 0:
             hex_num = 'false'
         else:
             hex_num = self.__color_dict.get(num)
-        if hex_num == None:
-            if self.__run_level > 3:
-                msg = 'no value in self.__color_dict for key %s\n' % num
-                raise self.__bug_hanlder, msg
-        if hex_num == None:
+        if hex_num is None:
             hex_num = '0'
+            if self.__run_level > 5:
+                msg = 'no value in self.__color_dict ' \
+                    'for key %s at line %d\n' % (num, self.__line)
+                raise self.__bug_handler, msg
         return hex_num

     def __do_nothing_func(self, line):
         """
         Bad RTF will have text in the color table
         """
         pass

     def convert_colors(self):
         """
         Requires:
@@ -226,20 +238,16 @@ class Colors:
         info, and substitute the number with the hex number.
         """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    self.__line += 1
                     self.__token_info = line[:16]
                     action = self.__state_dict.get(self.__state)
-                    if action == None:
-                        sys.stderr.write('no no matching state in module fonts.py\n')
+                    if action is None:
+                        sys.stderr.write('no matching state in module fonts.py\n')
                         sys.stderr.write(self.__state + '\n')
                     action(line)
-        read_obj.close()
-        self.__write_obj.close()
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "color.data")
@@ -33,13 +33,13 @@ class ConvertToTags:
         self.__copy = copy
         self.__dtd_path = dtd_path
         self.__no_dtd = no_dtd
-        if encoding != 'mac_roman':
-            self.__encoding = 'cp' + encoding
-        else:
+        if encoding == 'mac_roman':
             self.__encoding = 'mac_roman'
         self.__indent = indent
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+        self.__convert_utf = False

     def __initiate_values(self):
         """
@@ -213,7 +213,8 @@ class ConvertToTags:
         if not check_encoding_obj.check_encoding(self.__file, verbose=False):
             self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
         elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
-            self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
+            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
+            self.__convert_utf = True
         else:
             self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
             sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
@@ -253,7 +254,7 @@ class ConvertToTags:
         an empty tag function.
         """
         self.__initiate_values()
-        self.__write_obj = open(self.__write_to, 'w')
+        with open(self.__write_to, 'w') as self.__write_obj:
             self.__write_dec()
             with open(self.__file, 'r') as read_obj:
                 for line in read_obj:
@@ -262,6 +263,19 @@ class ConvertToTags:
                     if action is not None:
                         action(line)
-        self.__write_obj.close()
+        #convert all encodings to UTF8 to avoid unsupported encodings in lxml
+        if self.__convert_utf:
+            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+            copy_obj.rename(self.__write_to, self.__file)
+            with open(self.__file, 'r') as read_obj:
+                with open(self.__write_to, 'w') as write_obj:
+                    file = read_obj.read()
+                    try:
+                        file = file.decode(self.__encoding)
+                        write_obj.write(file.encode('utf-8'))
+                    except:
+                        sys.stderr.write('Conversion to UTF-8 is not possible,'
+                            ' encoding should be very carefully checked')
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
@@ -75,12 +75,16 @@ class DefaultEncoding:
             self._encoding()
             self.__datafetched = True
         code_page = 'ansicpg' + self.__code_page
+        if self.__code_page == '10000':
+            self.__code_page = 'mac_roman'
         return self.__platform, code_page, self.__default_num

     def get_codepage(self):
         if not self.__datafetched:
             self._encoding()
             self.__datafetched = True
+        if self.__code_page == '10000':
+            self.__code_page = 'mac_roman'
         return self.__code_page

     def get_platform(self):
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile

 from calibre.ebooks.rtf2xml import copy

 class Fonts:
     """
     Change lines with font info from font numbers to the actual font names.
@@ -45,6 +47,7 @@ class Fonts:
         self.__default_font_num = default_font_num
         self.__write_to = tempfile.mktemp()
         self.__run_level = run_level

     def __initiate_values(self):
         """
         Initiate all values.
@@ -67,6 +70,7 @@ class Fonts:
         self.__font_table = {}
         # individual font written
         self.__wrote_ind_font = 0

     def __default_func(self, line):
         """
         Requires:
@@ -79,6 +83,7 @@ class Fonts:
         if self.__token_info == 'mi<mk<fonttb-beg':
             self.__state = 'font_table'
         self.__write_obj.write(line)

     def __font_table_func(self, line):
         """
         Requires:
@@ -101,6 +106,7 @@ class Fonts:
         self.__font_num = self.__default_font_num
         self.__text_line = ''
         ##self.__write_obj.write(line)

     def __font_in_table_func(self, line):
         """
         Requires:
@@ -138,6 +144,7 @@ class Fonts:
         elif self.__token_info == 'mi<mk<fonttb-end':
             self.__found_end_font_table_func()
             self.__state = 'after_font_table'

     def __found_end_font_table_func(self):
         """
         Required:
@@ -150,7 +157,8 @@ class Fonts:
         if not self.__wrote_ind_font:
             self.__write_obj.write(
                 'mi<tg<empty-att_'
-                '<font-in-table<name>Times<num>0\n' )
+                '<font-in-table<name>Times<num>0\n')

     def __after_font_table_func(self, line):
         """
         Required:
@@ -169,7 +177,7 @@ class Fonts:
         if self.__token_info == 'cw<ci<font-style':
             font_num = line[20:-1]
             font_name = self.__font_table.get(font_num)
-            if font_name == None:
+            if font_name is None:
                 if self.__run_level > 3:
                     msg = 'no value for %s in self.__font_table\n' % font_num
                     raise self.__bug_handler, msg
@@ -182,6 +190,7 @@ class Fonts:
         )
         else:
             self.__write_obj.write(line)

     def convert_fonts(self):
         """
         Required:
@@ -197,20 +206,15 @@ class Fonts:
         info. Substitute a font name for a font number.
         """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
                     self.__token_info = line[:16]
                     action = self.__state_dict.get(self.__state)
-                    if action == None:
-                        sys.stderr.write('no no matching state in module fonts.py\n')
-                        sys.stderr.write(self.__state + '\n')
+                    if action is None:
+                        sys.stderr.write('no matching state in module fonts.py\n' \
+                            + self.__state + '\n')
                     action(line)
-        read_obj.close()
-        self.__write_obj.close()
         default_font_name = self.__font_table.get(self.__default_font_num)
         if not default_font_name:
             default_font_name = 'Not Defined'
@@ -43,7 +43,7 @@ class GetCharMap:
     def get_char_map(self, map):
         if map == 'ansicpg0':
             map = 'ansicpg1250'
-        if map in ('ansicpg10000', '10000'):
+        if map == 'ansicpg10000':
             map = 'mac_roman'
         found_map = False
         map_dict = {}
@@ -126,12 +126,6 @@ class Tokenize:
         tokens = re.split(self.__splitexp, input_file)
         #remove empty tokens and \n
         return filter(lambda x: len(x) > 0 and x != '\n', tokens)
-        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        # this is for older RTF
-        #line = re.sub(self.__par_exp, '\\par ', line)
-        #return filter(lambda x: len(x) > 0, \
-        #    (self.__remove_line.sub('', x) for x in tokens))

     def __compile_expressions(self):
         SIMPLE_RPL = {
@@ -160,7 +154,7 @@ class Tokenize:
         }
         self.__replace_spchar = MReplace(SIMPLE_RPL)
         #add ;? in case of char following \u
-        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
         self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
         self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
         #manage upr/ud situations
@@ -172,14 +166,21 @@ class Tokenize:
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
         #this is for old RTF
         self.__par_exp = re.compile(r'\\\n+')
         # self.__par_exp = re.compile(r'\\$')
+        #handle cw using a digit as argument and without space as delimiter
+        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")

+    def __correct_spliting(self, token):
+        match_obj = re.search(self.__cwdigit_exp, token)
+        if match_obj is None:
+            return token
+        else:
+            return '%s\n%s' % (match_obj.group(1), match_obj.group(2))

     def tokenize(self):
         """Main class for handling other methods. Reads the file \
         , uses method self.sub_reg to make basic substitutions,\
@@ -195,6 +196,8 @@ class Tokenize:
         tokens = map(self.__unicode_process, tokens)
         #remove empty items created by removing \uc
         tokens = filter(lambda x: len(x) > 0, tokens)
+        #handles bothersome cases
+        tokens = map(self.__correct_spliting, tokens)

         #write
         with open(self.__write_to, 'wb') as write_obj:
@@ -203,8 +206,6 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
-        # if self.__out_file:
-        #     self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
@@ -429,10 +429,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             old_extensions.add(ext)
         for ext in new_extensions:
             self.db.add_format(self.row, ext, open(paths[ext], 'rb'), notify=False)
-        db_extensions = set([f.lower() for f in self.db.formats(self.row).split(',')])
+        dbfmts = self.db.formats(self.row)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_formats:
                 self.db.remove_format(self.row, ext, notify=False)

     def show_format(self, item, *args):
@@ -576,6 +578,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         self.orig_date = qt_to_dt(self.date.date())

         exts = self.db.formats(row)
+        self.original_formats = []
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -586,6 +589,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_formats.append(ext.lower())

         self.initialize_combos()
@@ -472,6 +472,7 @@ class FormatsManager(QWidget): # {{{
     def initialize(self, db, id_):
         self.changed = False
         exts = db.formats(id_, index_is_id=True)
+        self.original_val = set([])
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -482,6 +483,7 @@ class FormatsManager(QWidget): # {{{
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_val.add(ext.lower())

     def commit(self, db, id_):
         if not self.changed:
@@ -500,11 +502,12 @@ class FormatsManager(QWidget): # {{{
         for ext in new_extensions:
             db.add_format(id_, ext, open(paths[ext], 'rb'), notify=False,
                 index_is_id=True)
-        db_extensions = set([f.lower() for f in db.formats(id_,
-            index_is_id=True).split(',')])
+        dbfmts = db.formats(id_, index_is_id=True)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_val:
                 db.remove_format(id_, ext, notify=False, index_is_id=True)

         self.changed = False
@@ -232,6 +232,7 @@ class BIBTEX(CatalogPlugin): # {{{
                     help = _('The fields to output when cataloging books in the '
                         'database. Should be a comma-separated list of fields.\n'
                         'Available fields: %s.\n'
+                        'plus user-created custom fields.\n'
                         'Example: %s=title,authors,tags\n'
                         "Default: '%%default'\n"
                         "Applies to: BIBTEX output format")%(', '.join(FIELDS),
@@ -269,7 +270,7 @@ class BIBTEX(CatalogPlugin): # {{{
                     dest = 'bib_cit',
                     action = None,
                     help = _('The template for citation creation from database fields.\n'
-                        ' Should be a template with {} enclosed fields.\n'
+                        'Should be a template with {} enclosed fields.\n'
                        'Available fields: %s.\n'
                        "Default: '%%default'\n"
                        "Applies to: BIBTEX output format")%', '.join(TEMPLATE_ALLOWED_FIELDS)),
@@ -344,7 +345,7 @@ class BIBTEX(CatalogPlugin): # {{{
             if field == 'authors' :
                 bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))

-            elif field in ['title', 'publisher', 'cover', 'uuid',
+            elif field in ['title', 'publisher', 'cover', 'uuid', 'ondevice',
                 'author_sort', 'series'] :
                 bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))

@@ -378,7 +379,7 @@ class BIBTEX(CatalogPlugin): # {{{
             if calibre_files:
                 files = [u':%s:%s' % (format, format.rpartition('.')[2].upper())\
                     for format in item]
-                bibtex_entry.append(u'files = "%s"' % u', '.join(files))
+                bibtex_entry.append(u'file = "%s"' % u', '.join(files))

             elif field == 'series_index' :
                 bibtex_entry.append(u'volume = "%s"' % int(item))
@@ -474,6 +475,8 @@ class BIBTEX(CatalogPlugin): # {{{
         if opts.verbose:
             opts_dict = vars(opts)
             log("%s(): Generating %s" % (self.name,self.fmt))
+            if opts.connected_device['is_device_connected']:
+                log(" connected_device: %s" % opts.connected_device['name'])
             if opts_dict['search_text']:
                 log(" --search='%s'" % opts_dict['search_text'])

@@ -548,6 +551,7 @@ class BIBTEX(CatalogPlugin): # {{{
                 as outfile:
             #File header
             nb_entries = len(data)

+            #check in book strict if all is ok else throw a warning into log
             if bib_entry == 'book' :
                 nb_books = len(filter(check_entry_book_valid, data))
@@ -555,6 +559,11 @@ class BIBTEX(CatalogPlugin): # {{{
                     log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries))
                 nb_entries = nb_books

+            # If connected device, add 'On Device' values to data
+            if opts.connected_device['is_device_connected'] and 'ondevice' in fields:
+                for entry in data:
+                    entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice']

             outfile.write(u'%%%Calibre catalog\n%%%{0} entries in catalog\n\n'.format(nb_entries))
             outfile.write(u'@preamble{"This catalog of %d entries was generated by calibre on %s"}\n\n'
                 % (nb_entries, nowf().strftime("%A, %d. %B %Y %H:%M").decode(preferred_encoding)))
@@ -391,6 +391,8 @@ Take your pick:
     * A tribute to the SONY Librie, which was the first e-ink based e-book reader
     * My wife chose it ;-)

 |app| is pronounced as cal-i-ber, *not* ca-libre. If you're wondering, |app| is the British/Commonwealth spelling of caliber. Being Indian, that's the natural spelling for me.

 Why does |app| show only some of my fonts on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 |app| embeds fonts in the e-book files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, so they cannot be embedded. |app| shows only the TrueType fonts found on your system. You can obtain many TrueType fonts on the web: simply download the .ttf files and add them to the Library/Fonts directory in your home directory, as in the sketch below.
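A minimal sketch, not from the manual, of that install step done in Python;
the font filename and its Downloads location are hypothetical:

    import os, shutil

    src = os.path.expanduser('~/Downloads/SomeFont.ttf')  # hypothetical downloaded .ttf
    dest = os.path.expanduser('~/Library/Fonts')          # per-user font folder on OS X
    shutil.copy(src, dest)                                # the font is now visible to |app|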