From a300879f7055ed0c9ddc5cbe44484fdfc940f05c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 19 Mar 2011 15:32:05 -0400 Subject: [PATCH 01/26] TXT Input: Texttile: Simplify code for handing macros and glyphs. --- src/calibre/ebooks/textile/functions.py | 408 +++++++----------------- 1 file changed, 114 insertions(+), 294 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 891211de30..b37cd4aab8 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -121,97 +121,113 @@ class Textile(object): btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p') btag_lite = ('bq', 'bc', 'p') - glyph_defaults = ( - ('mac_cent', '¢'), - ('mac_pound', '£'), - ('mac_yen', '¥'), - ('mac_quarter', '¼'), - ('mac_half', '½'), - ('mac_three-quarter', '¾'), - ('mac_cA-grave', 'À'), - ('mac_cA-acute', 'Á'), - ('mac_cA-circumflex', 'Â'), - ('mac_cA-tilde', 'Ã'), - ('mac_cA-diaeresis', 'Ä'), - ('mac_cA-ring', 'Å'), - ('mac_cAE', 'Æ'), - ('mac_cC-cedilla', 'Ç'), - ('mac_cE-grave', 'È'), - ('mac_cE-acute', 'É'), - ('mac_cE-circumflex', 'Ê'), - ('mac_cE-diaeresis', 'Ë'), - ('mac_cI-grave', 'Ì'), - ('mac_cI-acute', 'Í'), - ('mac_cI-circumflex', 'Î'), - ('mac_cI-diaeresis', 'Ï'), - ('mac_cEth', 'Ð'), - ('mac_cN-tilde', 'Ñ'), - ('mac_cO-grave', 'Ò'), - ('mac_cO-acute', 'Ó'), - ('mac_cO-circumflex', 'Ô'), - ('mac_cO-tilde', 'Õ'), - ('mac_cO-diaeresis', 'Ö'), - ('mac_cO-stroke', 'Ø'), - ('mac_cU-grave', 'Ù'), - ('mac_cU-acute', 'Ú'), - ('mac_cU-circumflex', 'Û'), - ('mac_cU-diaeresis', 'Ü'), - ('mac_cY-acute', 'Ý'), - ('mac_sa-grave', 'à'), - ('mac_sa-acute', 'á'), - ('mac_sa-circumflex', 'â'), - ('mac_sa-tilde', 'ã'), - ('mac_sa-diaeresis', 'ä'), - ('mac_sa-ring', 'å'), - ('mac_sae', 'æ'), - ('mac_sc-cedilla', 'ç'), - ('mac_se-grave', 'è'), - ('mac_se-acute', 'é'), - ('mac_se-circumflex', 'ê'), - ('mac_se-diaeresis', 'ë'), - ('mac_si-grave', 'ì'), - ('mac_si-acute', 'í'), - 
('mac_si-circumflex', 'î'), - ('mac_si-diaeresis', 'ï'), - ('mac_sn-tilde', 'ñ'), - ('mac_so-grave', 'ò'), - ('mac_so-acute', 'ó'), - ('mac_so-circumflex', 'ô'), - ('mac_so-tilde', 'õ'), - ('mac_so-diaeresis', 'ö'), - ('mac_so-stroke', 'ø'), - ('mac_su-grave', 'ù'), - ('mac_su-acute', 'ú'), - ('mac_su-circumflex', 'û'), - ('mac_su-diaeresis', 'ü'), - ('mac_sy-acute', 'ý'), - ('mac_sy-diaeresis', 'ÿ'), - ('mac_cOE', 'Œ'), - ('mac_soe', 'œ'), - ('mac_bullet', '•'), - ('mac_franc', '₣'), - ('mac_lira', '₤'), - ('mac_rupee', '₨'), - ('mac_euro', '€'), - ('mac_spade', '♠'), - ('mac_club', '♣'), - ('mac_heart', '♥'), - ('mac_diamond', '♦'), - ('txt_dimension', '×'), - ('txt_quote_single_open', '‘'), - ('txt_quote_single_close', '’'), - ('txt_quote_double_open', '“'), - ('txt_quote_double_close', '”'), - ('txt_apostrophe', '’'), - ('txt_prime', '′'), - ('txt_prime_double', '″'), - ('txt_ellipsis', '…'), - ('txt_emdash', '—'), - ('txt_endash', '–'), - ('txt_trademark', '™'), - ('txt_registered', '®'), - ('txt_copyright', '©'), - ) + macro_defaults = [ + (re.compile(r'{(c\||\|c)}'), r'¢'), # cent + (re.compile(r'{(L-|-L)}'), r'£'), # pound + (re.compile(r'{(Y=|=Y)}'), r'¥'), # yen + (re.compile(r'{\(c\)}'), r'©'), # copyright + (re.compile(r'{\(r\)}'), r'®'), # registered + (re.compile(r'{(\+_|_\+)}'), r'±'), # plus-minus + (re.compile(r'{1/4}'), r'¼'), # quarter + (re.compile(r'{1/2}'), r'½'), # half + (re.compile(r'{3/4}'), r'¾'), # three-quarter + (re.compile(r'{(A`|`A)}'), r'À'), # A-acute + (re.compile(r'{(A\'|\'A)}'), r'Á'), # A-grave + (re.compile(r'{(A\^|\^A)}'), r'Â'), # A-circumflex + (re.compile(r'{(A~|~A)}'), r'Ã'), # A-tilde + (re.compile(r'{(A\"|\"A)}'), r'Ä'), # A-diaeresis + (re.compile(r'{(Ao|oA)}'), r'Å'), # A-ring + (re.compile(r'{(AE)}'), r'Æ'), # AE + (re.compile(r'{(C,|,C)}'), r'Ç'), # C-cedilla + (re.compile(r'{(E`|`E)}'), r'È'), # E-acute + (re.compile(r'{(E\'|\'E)}'), r'É'), # E-grave + (re.compile(r'{(E\^|\^E)}'), r'Ê'), # E-circumflex + 
(re.compile(r'{(E\"|\"E)}'), r'Ë'), # E-diaeresis + (re.compile(r'{(I`|`I)}'), r'Ì'), # I-acute + (re.compile(r'{(I\'|\'I)}'), r'Í'), # I-grave + (re.compile(r'{(I\^|\^I)}'), r'Î'), # I-circumflex + (re.compile(r'{(I\"|\"I)}'), r'Ï'), # I-diaeresis + (re.compile(r'{(D-|-D)}'), r'Ð'), # ETH + (re.compile(r'{(N~|~N)}'), r'Ñ'), # N-tilde + (re.compile(r'{(O`|`O)}'), r'Ò'), # O-acute + (re.compile(r'{(O\'|\'O)}'), r'Ó'), # O-grave + (re.compile(r'{(O\^|\^O)}'), r'Ô'), # O-circumflex + (re.compile(r'{(O~|~O)}'), r'Õ'), # O-tilde + (re.compile(r'{(O\"|\"O)}'), r'Ö'), # O-diaeresis + (re.compile(r'{x}'), r'×'), # dimension + (re.compile(r'{(O\/|\/O)}'), r'Ø'), # O-slash + (re.compile(r'{(U`|`U)}'), r'Ù'), # U-acute + (re.compile(r'{(U\'|\'U)}'), r'Ú'), # U-grave + (re.compile(r'{(U\^|\^U)}'), r'Û'), # U-circumflex + (re.compile(r'{(U\"|\"U)}'), r'Ü'), # U-diaeresis + (re.compile(r'{(Y\'|\'Y)}'), r'Ý'), # Y-grave + (re.compile(r'{sz}'), r'ß'), # sharp-s + (re.compile(r'{(a`|`a)}'), r'à'), # a-grave + (re.compile(r'{(a\'|\'a)}'), r'á'), # a-acute + (re.compile(r'{(a\^|\^a)}'), r'â'), # a-circumflex + (re.compile(r'{(a~|~a)}'), r'ã'), # a-tilde + (re.compile(r'{(a\"|\"a)}'), r'ä'), # a-diaeresis + (re.compile(r'{(ao|oa)}'), r'å'), # a-ring + (re.compile(r'{ae}'), r'æ'), # ae + (re.compile(r'{(c,|,c)}'), r'ç'), # c-cedilla + (re.compile(r'{(e`|`e)}'), r'è'), # e-grave + (re.compile(r'{(e\'|\'e)}'), r'é'), # e-acute + (re.compile(r'{(e\^|\^e)}'), r'ê'), # e-circumflex + (re.compile(r'{(e\"|\"e)}'), r'ë'), # e-diaeresis + (re.compile(r'{(i`|`i)}'), r'ì'), # i-grave + (re.compile(r'{(i\'|\'i)}'), r'í'), # i-acute + (re.compile(r'{(i\^|\^i)}'), r'î'), # i-circumflex + (re.compile(r'{(i\"|\"i)}'), r'ï'), # i-diaeresis + (re.compile(r'{(d-|-d)}'), r'ð'), # eth + (re.compile(r'{(n~|~n)}'), r'ñ'), # n-tilde + (re.compile(r'{(o`|`o)}'), r'ò'), # o-grave + (re.compile(r'{(o\'|\'o)}'), r'ó'), # o-acute + (re.compile(r'{(o\^|\^o)}'), r'ô'), # o-circumflex + (re.compile(r'{(o~|~o)}'), 
r'õ'), # o-tilde + (re.compile(r'{(o\"|\"o)}'), r'ö'), # o-diaeresis + (re.compile(r'{(o\/|\/o)}'), r'ø'), # o-stroke + (re.compile(r'{(u`|`u)}'), r'ù'), # u-grave + (re.compile(r'{(u\'|\'u)}'), r'ú'), # u-acute + (re.compile(r'{(u\^|\^u)}'), r'û'), # u-circumflex + (re.compile(r'{(u\"|\"u)}'), r'ü'), # u-diaeresis + (re.compile(r'{(y\'|\'y)}'), r'ý'), # y-acute + (re.compile(r'{(y\"|\"y)}'), r'ÿ'), # y-diaeresis + (re.compile(r'{OE}'), r'Œ'), # OE + (re.compile(r'{oe}'), r'œ'), # oe + (re.compile(r'{(S\^|\^S)}'), r'Š'), # Scaron + (re.compile(r'{(s\^|\^s)}'), r'š'), # scaron + (re.compile(r'{\*}'), r'•'), # bullet + (re.compile(r'{Fr}'), r'₣'), # Franc + (re.compile(r'{(L=|=L)}'), r'₤'), # Lira + (re.compile(r'{Rs}'), r'₨'), # Rupee + (re.compile(r'{(C=|=C)}'), r'€'), # euro + (re.compile(r'{tm}'), r'™'), # trademark + (re.compile(r'{spade}'), r'♠'), # spade + (re.compile(r'{club}'), r'♣'), # club + (re.compile(r'{heart}'), r'♥'), # heart + (re.compile(r'{diamond}'), r'♦'), # diamond + ] + glyph_defaults = [ + (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign + (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime + (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r"(\w)\'(\w)"), r'\1’\2'), # apostrophe's + (re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), r'\1’\2'), # back in '88 + (re.compile(r'(\S)\'(?=\s|\'|<|$)'), r'\1’'), # single closing + (re.compile(r'\'/'), r'‘'), # single opening + (re.compile(r'(\")\"'), r'\1”'), # double closing - following another + (re.compile(r'(\S)\"(?=\s|\"|<|$)'), r'\1”'), # double closing + (re.compile(r'"'), r'“'), # double opening + (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym + (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase + (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1⁄'), # ellipsis + (re.compile(r'(\s?)--(\s?)'), r'\1—\2'), # em dash + (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash + (re.compile(r'\b( ?)[([]TM[])]', re.I), 
r'\1™'), # trademark + (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered + (re.compile(r'\b( ?)[([]C[])]', re.I), r'\1©'), # copyright + ] + def __init__(self, restricted=False, lite=False, noimage=False): """docstring for __init__""" @@ -673,211 +689,15 @@ class Textile(object): # fix: hackish text = re.sub(r'"\Z', '\" ', text) - glyph_search = ( - re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), # dimension sign - re.compile(r"(\w)\'(\w)"), # apostrophe's - re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88 - re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing - re.compile(r'\'/'), # single opening - re.compile(r'(\")\"'), # double closing - following another - re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing - re.compile(r'"'), # double opening - re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym - re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase - re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis - re.compile(r'(\s?)--(\s?)'), # em dash - re.compile(r'\s-(?:\s|$)'), # en dash - re.compile(r'\b( ?)[([]TM[])]', re.I), # trademark - re.compile(r'\b( ?)[([]R[])]', re.I), # registered - re.compile(r'\b( ?)[([]C[])]', re.I) # copyright - ) - - glyph_replace = [x % dict(self.glyph_defaults) for x in ( - r'\1\2%(txt_dimension)s\3', # dimension sign - r'\1%(txt_apostrophe)s\2', # apostrophe's - r'\1%(txt_apostrophe)s\2', # back in '88 - r'\1%(txt_quote_single_close)s', # single closing - r'%(txt_quote_single_open)s', # single opening - r'\1%(txt_quote_double_close)s', # double closing - following another - r'\1%(txt_quote_double_close)s', # double closing - r'%(txt_quote_double_open)s', # double opening - r'\1', # 3+ uppercase acronym - r'\1', # 3+ uppercase - r'\1%(txt_ellipsis)s', # ellipsis - r'\1%(txt_emdash)s\2', # em dash - r' %(txt_endash)s ', # en dash - r'\1%(txt_trademark)s', # trademark - r'\1%(txt_registered)s', # registered - r'\1%(txt_copyright)s' # copyright - )] - - if 
re.search(r'{.+?}', text): - glyph_search += ( - re.compile(r'{(c\||\|c)}'), # cent - re.compile(r'{(L-|-L)}'), # pound - re.compile(r'{(Y=|=Y)}'), # yen - re.compile(r'{\(c\)}'), # copyright - re.compile(r'{\(r\)}'), # registered - re.compile(r'{1/4}'), # quarter - re.compile(r'{1/2}'), # half - re.compile(r'{3/4}'), # three-quarter - re.compile(r'{(A`|`A)}'), # 192; - re.compile(r'{(A\'|\'A)}'), # 193; - re.compile(r'{(A\^|\^A)}'), # 194; - re.compile(r'{(A~|~A)}'), # 195; - re.compile(r'{(A\"|\"A)}'), # 196; - re.compile(r'{(Ao|oA)}'), # 197; - re.compile(r'{(AE)}'), # 198; - re.compile(r'{(C,|,C)}'), # 199; - re.compile(r'{(E`|`E)}'), # 200; - re.compile(r'{(E\'|\'E)}'), # 201; - re.compile(r'{(E\^|\^E)}'), # 202; - re.compile(r'{(E\"|\"E)}'), # 203; - re.compile(r'{(I`|`I)}'), # 204; - re.compile(r'{(I\'|\'I)}'), # 205; - re.compile(r'{(I\^|\^I)}'), # 206; - re.compile(r'{(I\"|\"I)}'), # 207; - re.compile(r'{(D-|-D)}'), # 208; - re.compile(r'{(N~|~N)}'), # 209; - re.compile(r'{(O`|`O)}'), # 210; - re.compile(r'{(O\'|\'O)}'), # 211; - re.compile(r'{(O\^|\^O)}'), # 212; - re.compile(r'{(O~|~O)}'), # 213; - re.compile(r'{(O\"|\"O)}'), # 214; - re.compile(r'{(O\/|\/O)}'), # 215; - re.compile(r'{(U`|`U)}'), # 216; - re.compile(r'{(U\'|\'U)}'), # 217; - re.compile(r'{(U\^|\^U)}'), # 218; - re.compile(r'{(U\"|\"U)}'), # 219; - re.compile(r'{(Y\'|\'Y)}'), # 220; - re.compile(r'{(a`|`a)}'), # a-grace - re.compile(r'{(a\'|\'a)}'), # a-acute - re.compile(r'{(a\^|\^a)}'), # a-circumflex - re.compile(r'{(a~|~a)}'), # a-tilde - re.compile(r'{(a\"|\"a)}'), # a-diaeresis - re.compile(r'{(ao|oa)}'), # a-ring - re.compile(r'{ae}'), # ae - re.compile(r'{(c,|,c)}'), # c-cedilla - re.compile(r'{(e`|`e)}'), # e-grace - re.compile(r'{(e\'|\'e)}'), # e-acute - re.compile(r'{(e\^|\^e)}'), # e-circumflex - re.compile(r'{(e\"|\"e)}'), # e-diaeresis - re.compile(r'{(i`|`i)}'), # i-grace - re.compile(r'{(i\'|\'i)}'), # i-acute - re.compile(r'{(i\^|\^i)}'), # i-circumflex - 
re.compile(r'{(i\"|\"i)}'), # i-diaeresis - re.compile(r'{(n~|~n)}'), # n-tilde - re.compile(r'{(o`|`o)}'), # o-grace - re.compile(r'{(o\'|\'o)}'), # o-acute - re.compile(r'{(o\^|\^o)}'), # o-circumflex - re.compile(r'{(o~|~o)}'), # o-tilde - re.compile(r'{(o\"|\"o)}'), # o-diaeresis - re.compile(r'{(o\/|\/o)}'), # o-stroke - re.compile(r'{(u`|`u)}'), # u-grace - re.compile(r'{(u\'|\'u)}'), # u-acute - re.compile(r'{(u\^|\^u)}'), # u-circumflex - re.compile(r'{(u\"|\"u)}'), # u-diaeresis - re.compile(r'{(y\'|\'y)}'), # y-acute - re.compile(r'{(y\"|\"y)}'), # y-diaeresis - re.compile(r'{OE}'), # y-diaeresis - re.compile(r'{oe}'), # y-diaeresis - re.compile(r'{\*}'), # bullet - re.compile(r'{Fr}'), # Franc - re.compile(r'{(L=|=L)}'), # Lira - re.compile(r'{Rs}'), # Rupee - re.compile(r'{(C=|=C)}'), # euro - re.compile(r'{tm}'), # euro - re.compile(r'{spade}'), # spade - re.compile(r'{club}'), # club - re.compile(r'{heart}'), # heart - re.compile(r'{diamond}') # diamond - ) - - glyph_replace += [x % dict(self.glyph_defaults) for x in ( - r'%(mac_cent)s', # cent - r'%(mac_pound)s', # pound - r'%(mac_yen)s', # yen - r'%(txt_copyright)s', # copyright - r'%(txt_registered)s', # registered - r'%(mac_quarter)s', # quarter - r'%(mac_half)s', # half - r'%(mac_three-quarter)s', # three-quarter - r'%(mac_cA-grave)s', # 192; - r'%(mac_cA-acute)s', # 193; - r'%(mac_cA-circumflex)s', # 194; - r'%(mac_cA-tilde)s', # 195; - r'%(mac_cA-diaeresis)s', # 196; - r'%(mac_cA-ring)s', # 197; - r'%(mac_cAE)s', # 198; - r'%(mac_cC-cedilla)s', # 199; - r'%(mac_cE-grave)s', # 200; - r'%(mac_cE-acute)s', # 201; - r'%(mac_cE-circumflex)s', # 202; - r'%(mac_cE-diaeresis)s', # 203; - r'%(mac_cI-grave)s', # 204; - r'%(mac_cI-acute)s', # 205; - r'%(mac_cI-circumflex)s', # 206; - r'%(mac_cI-diaeresis)s', # 207; - r'%(mac_cEth)s', # 208; - r'%(mac_cN-tilde)s', # 209; - r'%(mac_cO-grave)s', # 210; - r'%(mac_cO-acute)s', # 211; - r'%(mac_cO-circumflex)s', # 212; - r'%(mac_cO-tilde)s', # 213; - 
r'%(mac_cO-diaeresis)s', # 214; - r'%(mac_cO-stroke)s', # 216; - r'%(mac_cU-grave)s', # 217; - r'%(mac_cU-acute)s', # 218; - r'%(mac_cU-circumflex)s', # 219; - r'%(mac_cU-diaeresis)s', # 220; - r'%(mac_cY-acute)s', # 221; - r'%(mac_sa-grave)s', # 224; - r'%(mac_sa-acute)s', # 225; - r'%(mac_sa-circumflex)s', # 226; - r'%(mac_sa-tilde)s', # 227; - r'%(mac_sa-diaeresis)s', # 228; - r'%(mac_sa-ring)s', # 229; - r'%(mac_sae)s', # 230; - r'%(mac_sc-cedilla)s', # 231; - r'%(mac_se-grave)s', # 232; - r'%(mac_se-acute)s', # 233; - r'%(mac_se-circumflex)s', # 234; - r'%(mac_se-diaeresis)s', # 235; - r'%(mac_si-grave)s', # 236; - r'%(mac_si-acute)s', # 237; - r'%(mac_si-circumflex)s', # 238; - r'%(mac_si-diaeresis)s', # 239; - r'%(mac_sn-tilde)s', # 241; - r'%(mac_so-grave)s', # 242; - r'%(mac_so-acute)s', # 243; - r'%(mac_so-circumflex)s', # 244; - r'%(mac_so-tilde)s', # 245; - r'%(mac_so-diaeresis)s', # 246; - r'%(mac_so-stroke)s', # 248; - r'%(mac_su-grave)s', # 249; - r'%(mac_su-acute)s', # 250; - r'%(mac_su-circumflex)s', # 251; - r'%(mac_su-diaeresis)s', # 252; - r'%(mac_sy-acute)s', # 253; - r'%(mac_sy-diaeresis)s', # 255; - r'%(mac_cOE)s', # 338; - r'%(mac_soe)s', # 339; - r'%(mac_bullet)s', # bullet - r'%(mac_franc)s', # franc - r'%(mac_lira)s', # lira - r'%(mac_rupee)s', # rupee - r'%(mac_euro)s', # euro - r'%(txt_trademark)s', # trademark - r'%(mac_spade)s', # spade - r'%(mac_club)s', # club - r'%(mac_heart)s', # heart - r'%(mac_diamond)s' # diamond - )] - result = [] for line in re.compile(r'(<.*?>)', re.U).split(text): if not re.search(r'<.*>', line): - for s, r in zip(glyph_search, glyph_replace): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + self.glyph_defaults + else: + rules = self.glyph_defaults + for s, r in rules: line = s.sub(r, line) result.append(line) return ''.join(result) @@ -1045,7 +865,7 @@ class Textile(object): 'hello span strong and bold goodbye' """ qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', 
r'\+', r'~', r'\^') - pnct = ".,\"'?!;:" + pnct = ".,\"'?!;:()" for qtag in qtags: pattern = re.compile(r""" From 9ad00b98d4f441001f6c43221289a5acb036c477 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 19 Mar 2011 15:46:51 -0400 Subject: [PATCH 02/26] TXT Input: Textile: Fix issue with double closings. --- src/calibre/ebooks/textile/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index b37cd4aab8..5e07cdaec2 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -216,7 +216,7 @@ class Textile(object): (re.compile(r'(\S)\'(?=\s|\'|<|$)'), r'\1’'), # single closing (re.compile(r'\'/'), r'‘'), # single opening (re.compile(r'(\")\"'), r'\1”'), # double closing - following another - (re.compile(r'(\S)\"(?=\s|\"|<|$)'), r'\1”'), # double closing + (re.compile(r'(\S)\"(?=\s|”|<|$)'), r'\1”'), # double closing (re.compile(r'"'), r'“'), # double opening (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase From 86255b1b107aa510137bd802c9e39f06796b7fa1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 14:47:41 -0600 Subject: [PATCH 03/26] Fix #9448 (Support for Archos 43 tablet) --- src/calibre/devices/android/driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 1ddc14bd1f..a527e8a29b 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -64,6 +64,7 @@ class ANDROID(USBMS): 0x0e79 : { 0x1400 : [0x0222, 0x0216], 0x1408 : [0x0222, 0x0216], + 0x1417 : [0x0216], 0x1419 : [0x0216], 0x1420 : [0x0216], 0x1422 : [0x0216] @@ -98,7 +99,7 @@ class ANDROID(USBMS): 'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE', 'SGH-T849', 
'_MB300', 'A70S', 'S_ANDROID', 'A101IT', 'A70H', 'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE', 'SCH-I800_CARD', - '7', 'A956', 'A955'] + '7', 'A956', 'A955', 'A43'] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'A70S', 'A101IT', '7'] From 306a2e206c06ababc9d9577e7b20ec0f4f4fcff0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 15:25:18 -0600 Subject: [PATCH 04/26] ... --- src/calibre/gui2/widgets.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index 8ebf9c2c21..4ff2562bea 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -342,6 +342,7 @@ class FontFamilyModel(QAbstractListModel): self.families = list(qt_families.intersection(set(self.families))) self.families.sort() self.families[:0] = [_('None')] + self.font = QFont('sansserif') def rowCount(self, *args): return len(self.families) @@ -354,10 +355,11 @@ class FontFamilyModel(QAbstractListModel): return NONE if role == Qt.DisplayRole: return QVariant(family) - if False and role == Qt.FontRole: - # Causes a Qt crash with some fonts - # so disabled. - return QVariant(QFont(family)) + if role == Qt.FontRole: + # If a user chooses some non standard font as the interface font, + # rendering some font names causes Qt to crash, so return what is + # hopefully a "safe" font + return QVariant(self.font) return NONE def index_of(self, family): From ec622f426d2ae00f7fd01dbb591576b1ef213f99 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 15:36:50 -0600 Subject: [PATCH 05/26] ... 
--- setup/installer/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup/installer/__init__.py b/setup/installer/__init__.py index c25334dbe4..e07586eefd 100644 --- a/setup/installer/__init__.py +++ b/setup/installer/__init__.py @@ -14,9 +14,9 @@ from setup.build_environment import HOST, PROJECT BASE_RSYNC = ['rsync', '-avz', '--delete'] EXCLUDES = [] for x in [ - 'src/calibre/plugins', 'src/calibre/manual', 'src/calibre/trac', + 'src/calibre/plugins', 'src/calibre/manual', 'src/calibre/trac', 'recipes', '.bzr', '.build', '.svn', 'build', 'dist', 'imgsrc', '*.pyc', '*.pyo', '*.swp', - '*.swo']: + '*.swo', 'format_docs']: EXCLUDES.extend(['--exclude', x]) SAFE_EXCLUDES = ['"%s"'%x if '*' in x else x for x in EXCLUDES] From 0fbc8d717b186abbe87d47e09a3bf699ef7c6bc8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 17:52:35 -0600 Subject: [PATCH 06/26] ... --- setup/installer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/installer/__init__.py b/setup/installer/__init__.py index e07586eefd..f2d598e33a 100644 --- a/setup/installer/__init__.py +++ b/setup/installer/__init__.py @@ -138,7 +138,7 @@ class VMInstaller(Command): self.vm = self.VM if not self.vmware_started(): self.start_vmware() - subprocess.call(['chmod', '-R', '+r', 'resources/recipes']) + subprocess.call(['chmod', '-R', '+r', 'recipes']) self.start_vm() self.download_installer() if not self.dont_shutdown: From c979cb10b8a91da5d061c1bdec099510d84c0cff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 18:02:29 -0600 Subject: [PATCH 07/26] Windows build: Put all python files into a zip file to reduce upgrade time --- setup/installer/windows/freeze.py | 116 +++++++++++++++++++++++++++++- setup/installer/windows/site.py | 2 +- setup/installer/windows/util.c | 2 +- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/setup/installer/windows/freeze.py b/setup/installer/windows/freeze.py index 
e9e47816fd..cf4dcd5f9d 100644 --- a/setup/installer/windows/freeze.py +++ b/setup/installer/windows/freeze.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, shutil, glob, py_compile, subprocess, re +import sys, os, shutil, glob, py_compile, subprocess, re, zipfile, time from setup import Command, modules, functions, basenames, __version__, \ __appname__ @@ -40,6 +40,13 @@ DESCRIPTIONS = { 'calibre-smtp' : 'Command line interface for sending books via email', } +def walk(dir): + ''' A nice interface to os.walk ''' + for record in os.walk(dir): + for f in record[-1]: + yield os.path.join(record[0], f) + + class Win32Freeze(Command, WixMixIn): description = 'Free windows calibre installation' @@ -63,12 +70,15 @@ class Win32Freeze(Command, WixMixIn): self.rc_template = self.j(self.d(self.a(__file__)), 'template.rc') self.py_ver = ''.join(map(str, sys.version_info[:2])) self.lib_dir = self.j(self.base, 'Lib') + self.pydlib = self.j(self.base, 'pydlib') + self.pylib = self.j(self.base, 'pylib.zip') self.initbase() self.build_launchers() self.freeze() self.embed_manifests() self.install_site_py() + self.archive_lib_dir() self.create_installer() def initbase(self): @@ -356,4 +366,108 @@ class Win32Freeze(Command, WixMixIn): dest, lib] self.run_builder(cmd) + def archive_lib_dir(self): + self.info('Putting all python code into a zip file for performance') + if os.path.exists(self.pydlib): + shutil.rmtree(self.pydlib) + os.makedirs(self.pydlib) + self.zf_timestamp = time.localtime(time.time())[:6] + self.zf_names = set() + with zipfile.ZipFile(self.pylib, 'w', zipfile.ZIP_STORED) as zf: + for x in os.listdir(self.lib_dir): + if x == 'site-packages': + continue + self.add_to_zipfile(zf, x, self.lib_dir) + + sp = self.j(self.lib_dir, 'site-packages') + handled = set(['site.pyo']) + for pth in ('PIL.pth', 'pywin32.pth'): + handled.add(pth) + shutil.copyfile(self.j(sp, pth), self.j(self.pydlib, 
pth)) + for d in self.get_pth_dirs(self.j(sp, pth)): + shutil.copytree(d, self.j(self.pydlib, self.b(d)), True) + handled.add(self.b(d)) + + handled.add('easy-install.pth') + for d in self.get_pth_dirs(self.j(sp, 'easy-install.pth')): + handled.add(self.b(d)) + zip_safe = self.is_zip_safe(d) + for x in os.listdir(d): + if x == 'EGG-INFO': + continue + if zip_safe: + self.add_to_zipfile(zf, x, d) + else: + absp = self.j(d, x) + dest = self.j(self.pydlib, x) + if os.path.isdir(absp): + shutil.copytree(absp, dest, True) + else: + shutil.copy2(absp, dest) + + for x in os.listdir(sp): + if x in handled or x.endswith('.egg-info'): + continue + absp = self.j(sp, x) + if os.path.isdir(absp): + if not os.listdir(absp): + continue + if self.is_zip_safe(absp): + self.add_to_zipfile(zf, x, sp) + else: + shutil.copytree(absp, self.j(self.pydlib, x), True) + else: + if x.endswith('.pyd'): + shutil.copy2(absp, self.j(self.pydlib, x)) + else: + self.add_to_zipfile(zf, x, sp) + + shutil.rmtree(self.lib_dir) + + def is_zip_safe(self, path): + for f in walk(path): + ext = os.path.splitext(f)[1].lower() + if ext in ('.pyd', '.dll', '.exe'): + return False + return True + + def get_pth_dirs(self, pth): + base = os.path.dirname(pth) + for line in open(pth).readlines(): + line = line.strip() + if not line or line.startswith('#') or line.startswith('import'): + continue + if line == 'win32\\lib': + continue + candidate = self.j(base, line) + if os.path.exists(candidate): + yield candidate + + def add_to_zipfile(self, zf, name, base, exclude=frozenset()): + abspath = self.j(base, name) + name = name.replace(os.sep, '/') + if name in self.zf_names: + raise ValueError('Already added %r to zipfile [%r]'%(name, abspath)) + zinfo = zipfile.ZipInfo(filename=name, date_time=self.zf_timestamp) + + if os.path.isdir(abspath): + if not os.listdir(abspath): + return + zinfo.external_attr = 0700 << 16 + zf.writestr(zinfo, '') + for x in os.listdir(abspath): + if x not in exclude: + 
self.add_to_zipfile(zf, name + os.sep + x, base) + else: + ext = os.path.splitext(name)[1].lower() + if ext in ('.pyd', '.dll', '.exe'): + raise ValueError('Cannot add %r to zipfile'%abspath) + zinfo.external_attr = 0600 << 16 + if ext in ('.py', '.pyc', '.pyo'): + with open(abspath, 'rb') as f: + zf.writestr(zinfo, f.read()) + + self.zf_names.add(name) + + diff --git a/setup/installer/windows/site.py b/setup/installer/windows/site.py index 0e770f3253..5610ff197e 100644 --- a/setup/installer/windows/site.py +++ b/setup/installer/windows/site.py @@ -96,7 +96,7 @@ def main(): abs__file__() - addsitedir(os.path.join(sys.app_dir, 'Lib', 'site-packages')) + addsitedir(os.path.join(sys.app_dir, 'pydlib')) add_calibre_vars() diff --git a/setup/installer/windows/util.c b/setup/installer/windows/util.c index fdec6d786f..329e3bf8c3 100644 --- a/setup/installer/windows/util.c +++ b/setup/installer/windows/util.c @@ -198,7 +198,7 @@ void initialize_interpreter(wchar_t *outr, wchar_t *errr, buf[strlen(buf)-1] = '\0'; _snprintf_s(python_home, MAX_PATH, _TRUNCATE, "%s", buf); - _snprintf_s(path, 3*MAX_PATH, _TRUNCATE, "%s\\DLLs;%s\\Lib;%s\\Lib\\site-packages", + _snprintf_s(path, 3*MAX_PATH, _TRUNCATE, "%s\\pylib.zip;%s\\pydlib;%s\\DLLs", buf, buf, buf); free(buf); From 9e575a11b42de4bb2af8ac607999af0d891dd791 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Mar 2011 18:07:13 -0600 Subject: [PATCH 08/26] ... 
--- setup/installer/windows/wix-template.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup/installer/windows/wix-template.xml b/setup/installer/windows/wix-template.xml index 37dd8b25a8..9892041fee 100644 --- a/setup/installer/windows/wix-template.xml +++ b/setup/installer/windows/wix-template.xml @@ -154,9 +154,9 @@ - + From 833eb4f28572e451b5cb612d88bd1674c2e95236 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Mar 2011 10:20:39 -0600 Subject: [PATCH 09/26] Fix #9459 (T-Mobile Optimus T (LG P509) not detected) --- src/calibre/devices/android/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index a527e8a29b..e2ed159008 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -58,7 +58,7 @@ class ANDROID(USBMS): 0x413c : { 0xb007 : [0x0100, 0x0224]}, # LG - 0x1004 : { 0x61cc : [0x100], 0x61ce : [0x100] }, + 0x1004 : { 0x61cc : [0x100], 0x61ce : [0x100], 0x618e : [0x226] }, # Archos 0x0e79 : { @@ -92,14 +92,14 @@ class ANDROID(USBMS): VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER', 'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS', - 'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC'] + 'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE'] WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE', '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE', 'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT', 'A70H', 'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE', 'SCH-I800_CARD', - '7', 'A956', 'A955', 'A43'] + '7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM'] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'A70S', 'A101IT', '7'] From 
73d13c3f6a8c405da0412f194976be7591641c56 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Mar 2011 10:23:41 -0600 Subject: [PATCH 10/26] ... --- src/calibre/gui2/widgets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index 4ff2562bea..c570a6e159 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -317,7 +317,7 @@ class CoverView(QGraphicsView, ImageDropMixin): ImageDropMixin.__init__(self) def get_pixmap(self): - for item in self.scene().items(): + for item in self.scene.items(): if hasattr(item, 'pixmap'): return item.pixmap() From 3de3f3d4fbda16be5da6d45f597bcab00c61ed04 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Mar 2011 10:53:11 -0600 Subject: [PATCH 11/26] ... --- session.vim | 2 +- src/calibre/ebooks/rtf/input.py | 1 + src/calibre/utils/wmf/parse.py | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/session.vim b/session.vim index f2adf71de9..fa14a92fba 100644 --- a/session.vim +++ b/session.vim @@ -18,6 +18,6 @@ def recipe_title_callback(raw): return eval(raw.decode('utf-8')) vipy.session.add_content_browser('.r', ',r', 'Recipe', - vipy.session.glob_based_iterator(os.path.join(project_dir, 'resources', 'recipes', '*.recipe')), + vipy.session.glob_based_iterator(os.path.join(project_dir, 'recipes', '*.recipe')), vipy.session.regexp_based_matcher(r'title\s*=\s*(?P.+)', 'title', recipe_title_callback)) EOFPY diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 52f6feb071..1594b2fbce 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -22,6 +22,7 @@ border_style_map = { 'dot-dot-dash': 'dotted', 'outset': 'outset', 'tripple': 'double', + 'triple': 'double', 'thick-thin-small': 'solid', 'thin-thick-small': 'solid', 'thin-thick-thin-small': 'solid', diff --git a/src/calibre/utils/wmf/parse.py b/src/calibre/utils/wmf/parse.py index c618884e33..9dc035d3e1 
100644 --- a/src/calibre/utils/wmf/parse.py +++ b/src/calibre/utils/wmf/parse.py @@ -251,12 +251,12 @@ class WMF(object): img.load(bmp) return img.export('png') -def wmf_unwrap(wmf_data): +def wmf_unwrap(wmf_data, verbose=0): ''' Return the largest embedded raster image in the WMF. The returned data is in PNG format. ''' - w = WMF() + w = WMF(verbose=verbose) w(wmf_data) if not w.has_raster_image: raise ValueError('No raster image found in the WMF') @@ -266,4 +266,5 @@ if __name__ == '__main__': wmf = WMF(verbose=4) wmf(open(sys.argv[-1], 'rb')) open('/t/test.bmp', 'wb').write(wmf.bitmaps[0]) + open('/t/test.png', 'wb').write(wmf.to_png()) From 517a3e397575cd34fc77046a79e8d3f29d025ee5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 20 Mar 2011 11:18:18 -0600 Subject: [PATCH 12/26] Fix Washington Post --- recipes/wash_post.recipe | 68 ++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe index fb6d5bc598..3af89d502e 100644 --- a/recipes/wash_post.recipe +++ b/recipes/wash_post.recipe @@ -1,4 +1,3 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -6,55 +5,48 @@ class WashingtonPost(BasicNewsRecipe): title = 'Washington Post' description = 'US political news' - __author__ = 'Kovid Goyal and Sujata Raman' + __author__ = 'Kovid Goyal' use_embedded_content = False max_articles_per_feed = 20 language = 'en' + encoding = 'utf-8' remove_javascript = True no_stylesheets = True - extra_css = ''' - #articleCopyright { font-family:Arial,helvetica,sans-serif ; font-weight:bold ; font-size:x-small ;} - p { font-family:"Times New Roman",times,serif ; font-weight:normal ; font-size:small ;} - body{font-family:arial,helvetica,sans-serif} - ''' - - feeds = [ ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'), - ('Politics', 
'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'), - ('Nation', 'http://www.washingtonpost.com/wp-dyn/rss/nation/index.xml'), - ('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'), - ('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'), - ('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'), - ('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'), - ('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'), - ('Style', - 'http://www.washingtonpost.com/wp-dyn/rss/print/style/index.xml'), - ('NFL Sports', - 'http://www.washingtonpost.com/wp-dyn/rss/sports/index/nfl/index.xml'), - ('Redskins', 'http://www.washingtonpost.com/wp-dyn/rss/sports/redskins/index.xml'), - ('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'), + feeds = [ + ('Politics', 'http://www.washingtonpost.com/rss/politics'), + ('Nation', 'http://www.washingtonpost.com/rss/national'), + ('World', 'http://www.washingtonpost.com/rss/world'), + ('Business', 'http://www.washingtonpost.com/rss/business'), + ('Lifestyle', 'http://www.washingtonpost.com/rss/lifestyle'), + ('Sports', 'http://www.washingtonpost.com/rss/sports'), + ('Redskins', 'http://www.washingtonpost.com/rss/sports/redskins'), + ('Opinions', 'http://www.washingtonpost.com/rss/opinions'), + ('Entertainment', 'http://www.washingtonpost.com/rss/entertainment'), + ('Local', 'http://www.washingtonpost.com/rss/local'), + ('Investigations', + 'http://www.washingtonpost.com/rss/investigations'), ] - remove_tags = [{'id':['pfmnav', 'ArticleCommentsWrapper']}] + remove_tags = [ + {'class':lambda x: x and 'article-toolbar' in x}, + {'class':lambda x: x and 'quick-comments' in x}, + {'class':lambda x: x and 'tweet' in x}, + {'class':lambda x: x and 'article-related' in x}, + {'class':lambda x: x and 'hidden' in x.split()}, + {'class':lambda x: x and 'also-read' in x.split()}, + {'class':lambda 
x: x and 'partners-content' in x.split()}, + {'class':['module share', 'module ads', 'comment-vars', 'hidden', + 'share-icons-wrap', 'comments']}, + {'id':['right-rail']}, + ] + keep_only_tags = dict(id=['content', 'article']) - def get_article_url(self, article): - return article.get('guid', article.get('link', None)) def print_version(self, url): - return url.rpartition('.')[0] + '_pf.html' + url = url.rpartition('?')[0] + return url.replace('_story.html', '_singlePage.html') - def postprocess_html(self, soup, first): - for div in soup.findAll(name='div', style=re.compile('margin')): - div['style'] = '' - return soup - - def preprocess_html(self, soup): - for tag in soup.findAll('font'): - if tag.has_key('size'): - if tag['size'] == '+2': - if tag.b: - return soup - return None From 893035b874434a512f8fc54eeb4596c46dacae38 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 20 Mar 2011 11:21:01 -0600 Subject: [PATCH 13/26] Fix Christian Science Monitor --- recipes/chr_mon.recipe | 66 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 2b431ebd0b..6f41b95763 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -8,13 +8,13 @@ __description__ = 'Providing context and clarity on national and international n '''csmonitor.com''' - import re from calibre.web.feeds.news import BasicNewsRecipe + class ChristianScienceMonitor(BasicNewsRecipe): - author = 'Kovid Goyal, Sujata Raman and Lorenzo Vigentini' + __author__ = 'Kovid Goyal' description = 'Providing context and clarity on national and international news, peoples and cultures' cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif' @@ -34,6 +34,49 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + def append_page(self, soup, appendtag, position): + nav = 
soup.find('div',attrs={'class':'navigation'}) + if nav: + pager = nav.findAll('a') + for part in pager: + if 'Next' in part: + nexturl = ('http://www.csmonitor.com' + + re.findall(r'href="(.*?)"', str(part))[0]) + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', + attrs={'class': re.compile('list-article-.*')}) + trash_c = soup2.findAll(attrs={'class': 'list-description'}) + trash_h = soup2.h1 + for tc in trash_c: tc.extract() + trash_h.extract() + + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) + + def preprocess_html(self, soup): + PRINT_RE = re.compile(r'/layout/set/print/content/view/print/[0-9]*') + html = str(soup) + try: + print_found = PRINT_RE.findall(html) + except Exception: + pass + if print_found: + print_url = 'http://www.csmonitor.com' + print_found[0] + print_soup = self.index_to_soup(print_url) + else: + self.append_page(soup, soup.body, 3) + + trash_a = soup.findAll(attrs={'class': re.compile('navigation.*')}) + trash_b = soup.findAll(attrs={'style': re.compile('.*')}) + trash_d = soup.findAll(attrs={'class': 'sByline'}) + for ta in trash_a: ta.extract() + for tb in trash_b: tb.extract() + for td in trash_d: td.extract() + + print_soup = soup + return print_soup preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ @@ -43,7 +86,6 @@ class ChristianScienceMonitor(BasicNewsRecipe): (r'Full HTML version of this story which may include photos, graphics, and related links.*</body>', lambda match : '</body>'), ]] - extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} @@ -56,10 +98,9 @@ class ChristianScienceMonitor(BasicNewsRecipe): #main{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: small; } #photo-details{ font-family:Arial,Helvetica,sans-serif ; 
color:#999999; font-size: x-small;} span.name{color:#205B87;font-family: Georgia,Times,"Times New Roman",serif; font-size: x-small} - p#dateline{color:#444444 ; font-family:Arial,Helvetica,sans-serif ; font-style:italic;} - ''' - feeds = [ - (u'Top Stories' , u'http://rss.csmonitor.com/feeds/top'), + p#dateline{color:#444444 ; font-family:Arial,Helvetica,sans-serif ; font-style:italic;} ''' + + feeds = [(u'Top Stories', u'http://rss.csmonitor.com/feeds/top'), (u'World' , u'http://rss.csmonitor.com/feeds/world'), (u'USA' , u'http://rss.csmonitor.com/feeds/usa'), (u'Commentary' , u'http://rss.csmonitor.com/feeds/commentary'), @@ -74,9 +115,7 @@ class ChristianScienceMonitor(BasicNewsRecipe): (u'Home Forum' , u'http://rss.csmonitor.com/feeds/homeforum') ] - keep_only_tags = [ - dict(name='div', attrs={'id':'mainColumn'}), - ] + keep_only_tags = [dict(name='div', attrs={'id':'mainColumn'}), ] remove_tags = [ dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}), @@ -86,7 +125,10 @@ class ChristianScienceMonitor(BasicNewsRecipe): 'hide', 'podBrdr']}), dict(name='ul', attrs={'class':[ 'centerliststories']}) , dict(name='form', attrs={'id':[ 'commentform']}) , + dict(name='div', attrs={'class': ['ui-comments']}) ] - remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']})] - + remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']}), + dict(name='div', attrs={'class': [re.compile('navigation.*')]}), + dict(name='div', attrs={'style': [re.compile('.*')]}) + ] From d5166ed7bc9e764cb7dca315588100b2b3fbdd82 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 20 Mar 2011 19:10:15 -0400 Subject: [PATCH 14/26] TXT Input: Textile: More tweaks. 
--- src/calibre/ebooks/textile/functions.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 5e07cdaec2..88d27b036d 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -202,26 +202,31 @@ class Textile(object): (re.compile(r'{Rs}'), r'₨'), # Rupee (re.compile(r'{(C=|=C)}'), r'€'), # euro (re.compile(r'{tm}'), r'™'), # trademark - (re.compile(r'{spade}'), r'♠'), # spade - (re.compile(r'{club}'), r'♣'), # club - (re.compile(r'{heart}'), r'♥'), # heart - (re.compile(r'{diamond}'), r'♦'), # diamond + (re.compile(r'{spades?}'), r'♠'), # spade + (re.compile(r'{clubs?}'), r'♣'), # club + (re.compile(r'{hearts?}'), r'♥'), # heart + (re.compile(r'{diam(onds?|s)}'), r'♦'), # diamond ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\')\''), r'\1’'), # single closing - following another (re.compile(r"(\w)\'(\w)"), r'\1’\2'), # apostrophe's (re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), r'\1’\2'), # back in '88 - (re.compile(r'(\S)\'(?=\s|\'|<|$)'), r'\1’'), # single closing - (re.compile(r'\'/'), r'‘'), # single opening + (re.compile(r'(\s\[)\''), r'\1‘'), # single opening - following ws+[ + (re.compile(r'(\S)\'(?=\s|'+pnct+'|<|$)', re.M), r'\1’'), # single closing + (re.compile(r'\''), r'‘'), # single opening (re.compile(r'(\")\"'), r'\1”'), # double closing - following another - (re.compile(r'(\S)\"(?=\s|”|<|$)'), r'\1”'), # double closing + (re.compile(r'(\s\[)\"'), r'\1“'), # double opening - following whitespace+[ + (re.compile(r'(\S)\"(?=\s|'+pnct+'|<|$)', re.M), r'\1”'), # double closing (re.compile(r'"'), r'“'), # double opening (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ 
uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase - (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1⁄'), # ellipsis - (re.compile(r'(\s?)--(\s?)'), r'\1—\2'), # em dash + (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis + (re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break + (re.compile(r'\b--\b'), r'—'), # em dash + (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -747,7 +752,7 @@ class Textile(object): return url def shelve(self, text): - id = str(uuid.uuid4()) + id = str(uuid.uuid4()) + 'c' self.shelf[id] = text return id From 32703cb261e843709f27150d575b59fccfa5c780 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 20 Mar 2011 21:09:18 -0600 Subject: [PATCH 15/26] Fix #739120 (Migrate calibre bzr commit plugin to launchpad) --- src/calibre/trac/bzr_commit_plugin.py | 107 ++++++++------------------ 1 file changed, 32 insertions(+), 75 deletions(-) diff --git a/src/calibre/trac/bzr_commit_plugin.py b/src/calibre/trac/bzr_commit_plugin.py index 325bac7a79..c70e6fbf13 100644 --- a/src/calibre/trac/bzr_commit_plugin.py +++ b/src/calibre/trac/bzr_commit_plugin.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' @@ -8,114 +8,71 @@ __docformat__ = 'restructuredtext en' Plugin to make the commit command automatically close bugs when the commit message contains `Fix #number` or `Implement #number`. Also updates the commit message with the summary of the closed bug. It also set the `--fixes` metadata -appropriately. Currently only works with a Trac bug repository with the XMLRPC -plugin enabled. 
- -To use copy this file into `~/.bazaar/plugins` and add the following to branch.conf -in the working tree you want to use it with:: - - trac_reponame_url = <url> - trac_reponame_username = <username> - trac_reponame_password = <password> +appropriately. ''' -import os, re, xmlrpclib, subprocess -from bzrlib.builtins import cmd_commit as _cmd_commit, tree_files -from bzrlib import branch +import re, urllib, importlib, sys +from bzrlib.builtins import cmd_commit as _cmd_commit import bzrlib +from lxml import html + +SENDMAIL = ('/home/kovid/work/kde', 'pgp_mail') class cmd_commit(_cmd_commit): - @classmethod - def trac_url(self, username, password, url): - return url.replace('//', '//%s:%s@'%(username, password))+'/login/xmlrpc' - - def get_trac_summary(self, bug, url): - print 'Getting bug summary for bug #%s'%bug, - server = xmlrpclib.ServerProxy(url) - attributes = server.ticket.get(int(bug))[-1] - print attributes['summary'] - return attributes['summary'] - - def expand_bug(self, msg, nick, config, bug_tracker, type='trac'): - prefix = '%s_%s_'%(type, nick) - username = config.get_user_option(prefix+'username') - password = config.get_user_option(prefix+'password') - close_bug = config.get_user_option(prefix+'pattern') - if close_bug is None: - close_bug = r'(Fix|Implement|Fixes|Fixed|Implemented)\s+#(\d+)' + def expand_bug(self, msg): + close_bug = r'(Fix|Implement|Fixes|Fixed|Implemented)\s+#(\d+)' close_bug_pat = re.compile(close_bug, re.IGNORECASE) match = close_bug_pat.search(msg) if not match: return msg, None, None, None action, bug = match.group(1), match.group(2) summary = '' - if type == 'trac': - url = self.trac_url(username, password, bug_tracker) - summary = self.get_trac_summary(bug, url) + raw = urllib.urlopen('https://bugs.launchpad.net/calibre/+bug/' + + bug).read() + h1 = html.fromstring(raw).xpath('//h1[@id="edit-title"]')[0] + summary = html.tostring(h1, method='text', encoding=unicode).strip() + print 'Working on bug:', summary if summary: 
msg = msg.replace('#%s'%bug, '#%s (%s)'%(bug, summary)) msg = msg.replace('Fixesed', 'Fixed') - return msg, bug, url, action - - - def get_bugtracker(self, basedir, type='trac'): - config = os.path.join(basedir, '.bzr', 'branch', 'branch.conf') - bugtracker, nick = None, None - if os.access(config, os.R_OK): - for line in open(config).readlines(): - match = re.search(r'%s_(\S+)_url\s*=\s*(\S+)'%type, line) - if match: - nick, bugtracker = match.group(1), match.group(2) - break - return nick, bugtracker - - def expand_message(self, msg, tree): - nick, bugtracker = self.get_bugtracker(tree.basedir, type='trac') - if not bugtracker: - return msg - config = branch.Branch.open(tree.basedir).get_config() - msg, bug, url, action = self.expand_bug(msg, nick, config, bugtracker) - - return msg, bug, url, action, nick, config + return msg, bug, action def run(self, message=None, file=None, verbose=False, selected_list=None, unchanged=False, strict=False, local=False, fixes=None, author=None, show_diff=False, exclude=None): - nick = config = bug = action = None + bug = action = None if message: - try: - message, bug, url, action, nick, config = \ - self.expand_message(message, tree_files(selected_list)[0]) - except ValueError: - pass + message, bug, action = self.expand_bug(message) - if nick and bug and not fixes: - fixes = [nick+':'+bug] + if bug and not fixes: + fixes = ['lp:'+bug] ret = _cmd_commit.run(self, message=message, file=file, verbose=verbose, selected_list=selected_list, unchanged=unchanged, strict=strict, local=local, fixes=fixes, author=author, show_diff=show_diff, exclude=exclude) - if message and bug and action and nick and config: - self.close_bug(bug, action, url, config) + if message and bug and action: + self.close_bug(bug, action) return ret - def close_bug(self, bug, action, url, config): + def close_bug(self, bug, action): print 'Closing bug #%s'% bug #nick = config.get_nickname() - suffix = config.get_user_option('bug_close_comment') - if suffix is 
None: - suffix = 'The fix will be in the next release.' + suffix = ('The fix will be in the next release.' + 'calibre is usually released every Friday.') action = action+'ed' msg = '%s in branch %s. %s'%(action, 'lp:calibre', suffix) msg = msg.replace('Fixesed', 'Fixed') - server = xmlrpclib.ServerProxy(url) - server.ticket.update(int(bug), msg, - {'status':'closed', 'resolution':'fixed'}, - True) - subprocess.Popen('/home/kovid/work/kde/mail.py -f --delay 10'.split()) + msg += '\n\n status fixreleased' + + sys.path.insert(0, SENDMAIL[0]) + + sendmail = importlib.import_module(SENDMAIL[1]) + + to = bug+'@bugs.launchpad.net' + sendmail.sendmail(msg, to, 'Re: calibre bug '+bug) bzrlib.commands.register_command(cmd_commit) From 4c5a6213d95264ab8b8412f6a1826249e754c479 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 20 Mar 2011 22:25:08 -0600 Subject: [PATCH 16/26] Have the donate button go to calibre-ebook.com instead fo a custom HTML page --- src/calibre/gui2/ui.py | 41 ++++----------------------- src/calibre/trac/bzr_commit_plugin.py | 2 +- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 7b94c1e821..54f0bd3517 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -12,18 +12,17 @@ __docformat__ = 'restructuredtext en' import collections, os, sys, textwrap, time, gc from Queue import Queue, Empty from threading import Thread -from PyQt4.Qt import Qt, SIGNAL, QTimer, QHelpEvent, QAction, \ - QMenu, QIcon, pyqtSignal, \ - QDialog, QSystemTrayIcon, QApplication, QKeySequence +from PyQt4.Qt import (Qt, SIGNAL, QTimer, QHelpEvent, QAction, + QMenu, QIcon, pyqtSignal, QUrl, + QDialog, QSystemTrayIcon, QApplication, QKeySequence) from calibre import prints from calibre.constants import __appname__, isosx -from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.config import prefs, dynamic from calibre.utils.ipc.server import Server from 
calibre.library.database2 import LibraryDatabase2 from calibre.customize.ui import interface_actions -from calibre.gui2 import error_dialog, GetMetadata, open_local_file, \ +from calibre.gui2 import error_dialog, GetMetadata, open_url, \ gprefs, max_available_height, config, info_dialog, Dispatcher, \ question_dialog from calibre.gui2.cover_flow import CoverFlowMixin @@ -567,37 +566,7 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{ QApplication.instance().quit() def donate(self, *args): - BUTTON = ''' - <form action="https://www.paypal.com/cgi-bin/webscr" method="post"> - <input type="hidden" name="cmd" value="_s-xclick" /> - <input type="hidden" name="hosted_button_id" value="3029467" /> - <input type="image" src="https://www.paypal.com/en_US/i/btn/btn_donateCC_LG.gif" border="0" name="submit" alt="Donate to support calibre development" /> - <img alt="" border="0" src="https://www.paypal.com/en_US/i/scr/pixel.gif" width="1" height="1" /> - </form> - ''' - MSG = _('is the result of the efforts of many volunteers from all ' - 'over the world. If you find it useful, please consider ' - 'donating to support its development. Your donation helps ' - 'keep calibre development going.') - HTML = u''' - <html> - <head> - <meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> - <title>Donate to support calibre - - -
calibre
-

Calibre %s

- %s - - - '''%(P('content_server/calibre_banner.png').replace(os.sep, '/'), MSG, BUTTON) - pt = PersistentTemporaryFile('_donate.htm') - pt.write(HTML.encode('utf-8')) - pt.close() - open_local_file(pt.name) - + open_url(QUrl('http://calibre-ebook.com/donate')) def confirm_quit(self): if self.job_manager.has_jobs(): diff --git a/src/calibre/trac/bzr_commit_plugin.py b/src/calibre/trac/bzr_commit_plugin.py index c70e6fbf13..c70e8db703 100644 --- a/src/calibre/trac/bzr_commit_plugin.py +++ b/src/calibre/trac/bzr_commit_plugin.py @@ -26,7 +26,7 @@ class cmd_commit(_cmd_commit): close_bug_pat = re.compile(close_bug, re.IGNORECASE) match = close_bug_pat.search(msg) if not match: - return msg, None, None, None + return msg, None, None action, bug = match.group(1), match.group(2) summary = '' raw = urllib.urlopen('https://bugs.launchpad.net/calibre/+bug/' + From bba6e03a118d1ba3ffa241179029e740efaa46a5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Mar 2011 22:29:03 -0600 Subject: [PATCH 17/26] Add the keyboard shortcut: Ctrl+Shift+R to restart calibre in debug mode --- src/calibre/gui2/actions/preferences.py | 5 ++++- src/calibre/manual/gui.rst | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/actions/preferences.py b/src/calibre/gui2/actions/preferences.py index ee52f06aac..6615f5c017 100644 --- a/src/calibre/gui2/actions/preferences.py +++ b/src/calibre/gui2/actions/preferences.py @@ -25,8 +25,11 @@ class PreferencesAction(InterfaceAction): self.gui.run_wizard) if not DEBUG: pm.addSeparator() - pm.addAction(QIcon(I('debug.png')), _('Restart in debug mode'), + ac = pm.addAction(QIcon(I('debug.png')), _('Restart in debug mode'), self.debug_restart) + ac.setShortcut('Ctrl+Shift+R') + self.gui.addAction(ac) + self.qaction.setMenu(pm) self.preferences_menu = pm for x in (self.gui.preferences_action, self.qaction): diff --git a/src/calibre/manual/gui.rst b/src/calibre/manual/gui.rst index 158bd81e50..3ef1518209 100644 --- 
a/src/calibre/manual/gui.rst +++ b/src/calibre/manual/gui.rst @@ -549,6 +549,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes - Download metadata and shortcuts * - :kbd:`Ctrl+R` - Restart calibre + * - :kbd:`Ctrl+Shift+R` + - Restart calibre in debug mode * - :kbd:`Shift+Ctrl+E` - Add empty books to calibre * - :kbd:`Ctrl+Q` From de1e2369b3f7ce128764fb2e2cee6d7ba356084a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 09:33:46 -0600 Subject: [PATCH 18/26] Fix #739212 (new Android device ids) --- src/calibre/devices/android/driver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index e2ed159008..26039f16ef 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -64,6 +64,7 @@ class ANDROID(USBMS): 0x0e79 : { 0x1400 : [0x0222, 0x0216], 0x1408 : [0x0222, 0x0216], + 0x1411 : [0x216], 0x1417 : [0x0216], 0x1419 : [0x0216], 0x1420 : [0x0216], From 9385758a28f6257fe7a4adb263fddfc7ed924888 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 10:09:09 -0600 Subject: [PATCH 19/26] News download: Handle titles with ASCII control codes in them. Fixes #739322 (News fetching - NULL bytes problem) --- src/calibre/utils/cleantext.py | 11 +++++++---- src/calibre/web/feeds/__init__.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index 89101a6219..27e667612e 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -8,15 +8,18 @@ import re, htmlentitydefs _ascii_pat = None def clean_ascii_chars(txt, charlist=None): - ''' - Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default - This is all control chars except \\t,\\n and \\r + r''' + Remove ASCII control chars. 
+ This is all control chars except \t, \n and \r ''' if not txt: return '' global _ascii_pat if _ascii_pat is None: - chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F)) + chars = set(xrange(32)) + chars.add(127) + for x in (9, 10, 13): + chars.remove(x) _ascii_pat = re.compile(u'|'.join(map(unichr, chars))) if charlist is None: diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index cddb776b4c..a10fb03f91 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -28,6 +28,7 @@ class Article(object): pass if not isinstance(self._title, unicode): self._title = self._title.decode('utf-8', 'replace') + self._title = clean_ascii_chars(self._title) self.url = url self.author = author if author and not isinstance(author, unicode): @@ -75,7 +76,7 @@ class Article(object): t = t.decode('utf-8', 'replace') return t def fset(self, val): - self._title = val + self._title = clean_ascii_chars(val) return property(fget=fget, fset=fset) From 738f5b66e65a23f4a5cc8716fb90d8c69e1c2e7a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 10:38:56 -0600 Subject: [PATCH 20/26] Fix #739484 (Blackberry OS6) --- src/calibre/devices/blackberry/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index e816883957..1ae6a6c49f 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -19,7 +19,7 @@ class BLACKBERRY(USBMS): VENDOR_ID = [0x0fca] PRODUCT_ID = [0x8004, 0x0004] - BCD = [0x0200, 0x0107, 0x0210, 0x0201, 0x0211] + BCD = [0x0200, 0x0107, 0x0210, 0x0201, 0x0211, 0x0220] VENDOR_NAME = 'RIM' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' From d12b40a18e52d6cd87582d6d216402c82caa381a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 12:14:15 -0600 Subject: [PATCH 21/26] Fix regression that broke dropping lots of books onto items in the Tag 
Browser --- src/calibre/gui2/tag_view.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index 5423e546ea..34fa3a8b10 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -16,8 +16,7 @@ from PyQt4.Qt import Qt, QTreeView, QApplication, pyqtSignal, QFont, QSize, \ QIcon, QPoint, QVBoxLayout, QHBoxLayout, QComboBox, QTimer,\ QAbstractItemModel, QVariant, QModelIndex, QMenu, QFrame,\ QPushButton, QWidget, QItemDelegate, QString, QLabel, \ - QShortcut, QKeySequence, SIGNAL, QMimeData, QSizePolicy,\ - QToolButton + QShortcut, QKeySequence, SIGNAL, QMimeData, QToolButton from calibre.ebooks.metadata import title_sort from calibre.gui2 import config, NONE, gprefs @@ -1052,12 +1051,12 @@ class TagsModel(QAbstractItemModel): # {{{ if (key == 'authors' and len(ids) >= 5): if not confirm('

'+_('Changing the authors for several books can ' 'take a while. Are you sure?') - +'

', 'tag_browser_drop_authors', self.parent()): + +'

', 'tag_browser_drop_authors', self.tags_view): return elif len(ids) > 15: if not confirm('

'+_('Changing the metadata for that many books ' 'can take a while. Are you sure?') - +'

', 'tag_browser_many_changes', self.parent()): + +'

', 'tag_browser_many_changes', self.tags_view): return fm = self.db.metadata_for_field(key) From 83c8257a146ab75f6453a826ebdc0b30ea3f788b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 15:50:21 -0600 Subject: [PATCH 22/26] Conversion: Detect and remove fake page margins that are specified as a margin on every paragraph. This can be turned off via an option under Structure Detection --- src/calibre/ebooks/conversion/cli.py | 5 +- src/calibre/ebooks/conversion/plumber.py | 15 ++ .../ebooks/oeb/transforms/page_margin.py | 153 ++++++++++++++++++ .../gui2/convert/structure_detection.py | 2 +- .../gui2/convert/structure_detection.ui | 13 +- 5 files changed, 183 insertions(+), 5 deletions(-) create mode 100644 src/calibre/ebooks/oeb/transforms/page_margin.py diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 975507e2a7..f1d5d5fe1b 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -49,6 +49,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings', 'dehyphenate', 'renumber_headings', 'replace_scene_breaks'] +DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins'] + def print_help(parser, log): help = parser.format_help().encode(preferred_encoding, 'replace') log(help) @@ -90,7 +92,7 @@ def option_recommendation_to_cli_option(add_option, rec): if opt.long_switch == 'verbose': attrs['action'] = 'count' attrs.pop('type', '') - if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True: + if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True: switches = ['--disable-'+opt.long_switch] add_option(Option(*switches, **attrs)) @@ -162,6 +164,7 @@ def add_pipeline_options(parser, plumber): 'chapter', 'chapter_mark', 'prefer_metadata_cover', 'remove_first_image', 'insert_metadata', 'page_breaks_before', + 'remove_fake_margins', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 
9a0c3f3c7f..6272e7b10b 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -304,6 +304,17 @@ OptionRecommendation(name='page_breaks_before', 'before the specified elements.') ), +OptionRecommendation(name='remove_fake_margins', + recommended_value=True, level=OptionRecommendation.LOW, + help=_('Some documents specify page margins by ' + 'specifying a left and right margin on each individual ' + 'paragraph. calibre will try to detect and remove these ' + 'margins. Sometimes, this can cause the removal of ' + 'margins that should not have been removed. In this ' + 'case you can disable the removal.') + ), + + OptionRecommendation(name='margin_top', recommended_value=5.0, level=OptionRecommendation.LOW, help=_('Set the top margin in pts. Default is %default. ' @@ -988,9 +999,13 @@ OptionRecommendation(name='sr3_replace', page_break_on_body=self.output_plugin.file_type in ('mobi', 'lit')) flattener(self.oeb, self.opts) + self.opts.insert_blank_line = oibl self.opts.remove_paragraph_spacing = orps + from calibre.ebooks.oeb.transforms.page_margin import RemoveFakeMargins + RemoveFakeMargins()(self.oeb, self.log, self.opts) + pr(0.9) self.flush() diff --git a/src/calibre/ebooks/oeb/transforms/page_margin.py b/src/calibre/ebooks/oeb/transforms/page_margin.py new file mode 100644 index 0000000000..589f004dd1 --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/page_margin.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from collections import Counter + +from calibre.ebooks.oeb.base import OEB_STYLES, barename, XPath + +class RemoveFakeMargins(object): + + ''' + Remove left and right margins from paragraph/divs if the same margin is specified + on almost all the elements of at that 
level. + + Must be called only after CSS flattening + ''' + + def __call__(self, oeb, log, opts): + if not opts.remove_fake_margins: + return + self.oeb, self.log, self.opts = oeb, log, opts + stylesheet = None + self.levels = {} + self.stats = {} + self.selector_map = {} + + for item in self.oeb.manifest: + if item.media_type.lower() in OEB_STYLES: + stylesheet = item + break + if stylesheet is None: + return + + self.log('Removing fake margins...') + + stylesheet = stylesheet.data + + from cssutils.css import CSSRule + for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE): + self.selector_map[rule.selectorList.selectorText] = rule.style + + self.find_levels() + + for level in self.levels: + self.process_level(level) + + def get_margins(self, elem): + cls = elem.get('class', None) + if cls: + style = self.selector_map.get('.'+cls, None) + if style: + return style.marginLeft, style.marginRight, style + return '', '', None + + + def process_level(self, level): + elems = self.levels[level] + self.stats[level+'_left'] = Counter() + self.stats[level+'_right'] = Counter() + + for elem in elems: + lm, rm = self.get_margins(elem)[:2] + self.stats[level+'_left'][lm] += 1 + self.stats[level+'_right'][rm] += 1 + + self.log.debug(level, ' left margin stats:', self.stats[level+'_left']) + self.log.debug(level, ' right margin stats:', self.stats[level+'_right']) + + remove_left = self.analyze_stats(self.stats[level+'_left']) + remove_right = self.analyze_stats(self.stats[level+'_right']) + + + if remove_left: + mcl = self.stats[level+'_left'].most_common(1)[0][0] + self.log('Removing level %s left margin of:'%level, mcl) + + if remove_right: + mcr = self.stats[level+'_right'].most_common(1)[0][0] + self.log('Removing level %s right margin of:'%level, mcr) + + if remove_left or remove_right: + for elem in elems: + lm, rm, style = self.get_margins(elem) + if remove_left and lm == mcl: + style.removeProperty('margin-left') + if remove_right and rm == mcr: + 
style.removeProperty('margin-right') + + def find_levels(self): + + def level_of(elem, body): + ans = 1 + while elem.getparent() is not body: + ans += 1 + elem = elem.getparent() + return ans + + paras = XPath('descendant::h:p|descendant::h:div') + + for item in self.oeb.spine: + body = XPath('//h:body')(item.data) + if not body: + continue + body = body[0] + + for p in paras(body): + level = level_of(p, body) + level = '%s_%d'%(barename(p.tag), level) + if level not in self.levels: + self.levels[level] = [] + self.levels[level].append(p) + + remove = set() + for k, v in self.levels.iteritems(): + num = len(v) + self.log.debug('Found %d items of level:'%num, k) + level = int(k.split('_')[-1]) + tag = k.split('_')[0] + if tag == 'p' and num < 25: + remove.add(k) + if tag == 'div': + if level > 2 and num < 25: + remove.add(k) + elif level < 3: + # Check each level < 3 element and only keep those + # that have many child paras + for elem in list(v): + children = len(paras(elem)) + if children < 5: + v.remove(elem) + + for k in remove: + self.levels.pop(k) + self.log.debug('Ignoring level', k) + + def analyze_stats(self, stats): + if not stats: + return False + mc = stats.most_common(1) + if len(mc) > 1: + return False + mc = mc[0] + most_common, most_common_count = mc + if not most_common or most_common == '0': + return False + total = sum(stats.values()) + # True if greater than 95% of elements have the same margin + return most_common_count/total > 0.95 diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py index d8e2f4f122..b58c473bd4 100644 --- a/src/calibre/gui2/convert/structure_detection.py +++ b/src/calibre/gui2/convert/structure_detection.py @@ -21,7 +21,7 @@ class StructureDetectionWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['chapter', 'chapter_mark', - 'remove_first_image', + 'remove_first_image', 
'remove_fake_margins', 'insert_metadata', 'page_breaks_before'] ) self.db, self.book_id = db, book_id diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index f80e6f8182..4ba90c1c2c 100644 --- a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -48,10 +48,10 @@ - + - + Qt::Vertical @@ -77,7 +77,7 @@ - + The header and footer removal options have been replaced by the Search & Replace options. Click the Search & Replace category in the bar to the left to use these options. Leave the replace field blank and enter your header/footer removal regexps into the search field. @@ -87,6 +87,13 @@ + + + + Remove &fake margins + + + From 6f3baa43575d0e5e26f9a1ca4f8bbfed06c22cb4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 16:49:49 -0600 Subject: [PATCH 23/26] Caijing Magazine by Eric Chen --- recipes/caijing.recipe | 79 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 recipes/caijing.recipe diff --git a/recipes/caijing.recipe b/recipes/caijing.recipe new file mode 100644 index 0000000000..34e6c1e8a9 --- /dev/null +++ b/recipes/caijing.recipe @@ -0,0 +1,79 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class Caijing(BasicNewsRecipe): + + title = 'Caijing Magazine' + __author__ = 'Eric Chen' + + description = '''Bi-weekly Finance and Economics Review. Founded in 1998, the fortnightly CAIJING + Magazine has firmly established itself as a news authority and leading voice for + business and financial issues in China. + CAIJING Magazine closely tracks the most important aspects of China's economic reforms, + developments and policy changes, as well as major events in the capital markets. It also + offers a broad international perspective through first-hand reporting on international + political and economic issues. 
+ CAIJING Magazine is China's most widely read business and finance magazine, with a + circulation of 225,000 per issue. It boasts top-level readers from government, business + and academic circles. ''' + language = 'zh' + category = 'news, China' + encoding = 'UTF-8' + timefmt = ' [%a, %d %b, %Y]' + needs_subscription = True + + remove_tags = [dict(attrs={'class':['topad', 'nav', 'searchbox', 'connav', + 'mbx', 'bianji', 'bianji bj', 'lnewlist', 'rdtj', 'loadComment', + 'conr', 'bottom', 'bottomcopyr', 'emaildy', 'rcom', 'allcontent']}), + dict(name=['script', 'noscript', 'style'])] + no_stylesheets = True + remove_javascript = True + current_issue_url = "" + current_issue_cover = "" + + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://service.caijing.com.cn/usermanage/login') + br.select_form(name='mainLoginForm') + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + articles = [] + soup0 = self.index_to_soup('http://magazine.caijing.com.cn/2011/cjindex2011/') + div = soup0.find('div', attrs={'class':'fmcon'}) + link = div.find('a', href=True) + current_issue_url = link['href'] + + soup = self.index_to_soup(current_issue_url) + + for div_cover in soup.findAll('img', {'src' : re.compile('.')}): + if re.search('\d{4}-\d{2}-\d{2}', div_cover['src']): + self.current_issue_cover = div_cover['src'] + + feeds = [] + for section in soup.findAll('div', attrs={'class':'cebd'}): + section_title = self.tag_to_string(section.find('div', attrs={'class':'ceti'})) + articles = [] + for post in section.findAll('a', href=True): + if re.search('\d{4}-\d{2}-\d{2}', post['href']): + date = re.search('\d{4}-\d{2}-\d{2}', post['href']).group(0) + id = re.search('\d{9}', post['href']).group(0) + url = re.sub(r'\d.*', 'templates/inc/chargecontent2.jsp?id=', post['href']) + url = url + id + '&time=' + date + '&cl=106&page=all' + + 
title = self.tag_to_string(post) + articles.append({'title':title, 'url':url, 'date':date}) + + if articles: + feeds.append((section_title, articles)) + return feeds + + def get_cover_url(self): + return self.current_issue_cover + From 15fa4f71c4881773f1762aeb3fc9bc4c5ea5200c Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 21 Mar 2011 19:12:54 -0400 Subject: [PATCH 24/26] TXT Input: Textile: Rely on smarty pants to handle quotes. --- src/calibre/ebooks/textile/functions.py | 14 ++------------ src/calibre/ebooks/txt/input.py | 1 + src/calibre/utils/smartypants.py | 6 ++++++ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 88d27b036d..c3c82ef893 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -211,16 +211,6 @@ class Textile(object): (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double - (re.compile(r'(\')\''), r'\1’'), # single closing - following another - (re.compile(r"(\w)\'(\w)"), r'\1’\2'), # apostrophe's - (re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), r'\1’\2'), # back in '88 - (re.compile(r'(\s\[)\''), r'\1‘'), # single opening - following ws+[ - (re.compile(r'(\S)\'(?=\s|'+pnct+'|<|$)', re.M), r'\1’'), # single closing - (re.compile(r'\''), r'‘'), # single opening - (re.compile(r'(\")\"'), r'\1”'), # double closing - following another - (re.compile(r'(\s\[)\"'), r'\1“'), # double opening - following whitespace+[ - (re.compile(r'(\S)\"(?=\s|'+pnct+'|<|$)', re.M), r'\1”'), # double closing - (re.compile(r'"'), r'“'), # double opening (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis @@ -870,11 +860,11 @@ class 
Textile(object): 'hello span strong and bold goodbye' """ qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') - pnct = ".,\"'?!;:()" + pnct = ".,\"'?!;:" for qtag in qtags: pattern = re.compile(r""" - (?:^|(?<=[\s>%(pnct)s])|\[|([\]}])) + (?:^|(?<=[\s>%(pnct)s\(])|\[|([\]}])) (%(qtag)s)(?!%(qtag)s) (%(c)s) (?::(\S+))? diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 99f7035800..7face4c24f 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -165,6 +165,7 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text through textile conversion...') html = convert_textile(txt) + setattr(options, 'smarten_punctuation', True) else: log.debug('Running text through basic conversion...') flow_size = getattr(options, 'flow_size', 0) diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py index 62845b8d7a..8763a313fc 100644 --- a/src/calibre/utils/smartypants.py +++ b/src/calibre/utils/smartypants.py @@ -584,6 +584,12 @@ def educateQuotes(str): #

He said, "'Quoted' words in a larger quote."

str = re.sub(r""""'(?=\w)""", """“‘""", str) str = re.sub(r"""'"(?=\w)""", """‘“""", str) + str = re.sub(r'''""(?=\w)''', """““""", str) + str = re.sub(r"""''(?=\w)""", """‘‘""", str) + str = re.sub(r'''\"\'''', """”’""", str) + str = re.sub(r'''\'\"''', """’”""", str) + str = re.sub(r'''""''', """””""", str) + str = re.sub(r"""''""", """’’""", str) # Special case for decade abbreviations (the '80s): str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str) From d37f302a0e96ec946ed8b78d34732d53dfa1a69a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 22:24:36 -0600 Subject: [PATCH 25/26] ... --- src/calibre/trac/bzr_commit_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/trac/bzr_commit_plugin.py b/src/calibre/trac/bzr_commit_plugin.py index c70e8db703..7e5a1367cb 100644 --- a/src/calibre/trac/bzr_commit_plugin.py +++ b/src/calibre/trac/bzr_commit_plugin.py @@ -60,7 +60,7 @@ class cmd_commit(_cmd_commit): def close_bug(self, bug, action): print 'Closing bug #%s'% bug #nick = config.get_nickname() - suffix = ('The fix will be in the next release.' + suffix = ('The fix will be in the next release. ' 'calibre is usually released every Friday.') action = action+'ed' msg = '%s in branch %s. %s'%(action, 'lp:calibre', suffix) @@ -72,7 +72,7 @@ class cmd_commit(_cmd_commit): sendmail = importlib.import_module(SENDMAIL[1]) to = bug+'@bugs.launchpad.net' - sendmail.sendmail(msg, to, 'Re: calibre bug '+bug) + sendmail.sendmail(msg, to, 'Fixed in lp:calibre') bzrlib.commands.register_command(cmd_commit) From 74d1fb4c4912ef93a90ed1622188d76c0b58d56d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Mar 2011 22:26:21 -0600 Subject: [PATCH 26/26] Initial implementation of relevance sorting of metadata identify results. 
Needs testing --- src/calibre/ebooks/metadata/sources/amazon.py | 37 ++++-- src/calibre/ebooks/metadata/sources/base.py | 105 ++++++++++++++++++ src/calibre/ebooks/metadata/sources/google.py | 5 +- src/calibre/ebooks/metadata/sources/test.py | 14 +-- 4 files changed, 137 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index b99893ccba..9460ed7ace 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -28,11 +28,12 @@ class Worker(Thread): # {{{ Get book details from amazons book page in a separate thread ''' - def __init__(self, url, result_queue, browser, log, timeout=20): + def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20): Thread.__init__(self) self.daemon = True self.url, self.result_queue = url, result_queue self.log, self.timeout = log, timeout + self.relevance, self.plugin = relevance, plugin self.browser = browser.clone_browser() self.cover_url = self.amazon_id = self.isbn = None @@ -161,6 +162,15 @@ class Worker(Thread): # {{{ else: self.log.warning('Failed to find product description for url: %r'%self.url) + mi.source_relevance = self.relevance + + if self.amazon_id: + if self.isbn: + self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id) + if self.cover_url: + self.cache_identifier_to_cover_url(self.amazon_id, + self.cover_url) + self.result_queue.put(mi) def parse_asin(self, root): @@ -321,6 +331,20 @@ class Amazon(Source): # }}} + def get_cached_cover_url(self, identifiers): + url = None + asin = identifiers.get('amazon', None) + if asin is None: + asin = identifiers.get('asin', None) + if asin is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + asin = self.cached_isbn_to_identifier(isbn) + if asin is not None: + url = self.cached_identifier_to_cover_url(asin) + + return url + def identify(self, log, result_queue, abort, title=None, 
authors=None, # {{{ identifiers={}, timeout=30): ''' @@ -396,7 +420,8 @@ class Amazon(Source): log.error('No matches found with query: %r'%query) return - workers = [Worker(url, result_queue, br, log) for url in matches] + workers = [Worker(url, result_queue, br, log, i, self) for i, url in + enumerate(matches)] for w in workers: w.start() @@ -414,14 +439,6 @@ class Amazon(Source): if not a_worker_is_alive: break - for w in workers: - if w.amazon_id: - if w.isbn: - self.cache_isbn_to_identifier(w.isbn, w.amazon_id) - if w.cover_url: - self.cache_identifier_to_cover_url(w.amazon_id, - w.cover_url) - return None # }}} diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 55cc996cf7..90d7f82d65 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -21,6 +21,21 @@ def create_log(ostream=None): log.outputs = [FileStream(ostream)] return log +words = ("the", "a", "an", "of", "and") +prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words))) +trailing_paren_pat = re.compile(r'\(.*\)$') +whitespace_pat = re.compile(r'\s+') + +def cleanup_title(s): + if not s: + s = _('Unknown') + s = s.strip().lower() + s = prefix_pat.sub(' ', s) + s = trailing_paren_pat.sub('', s) + s = whitespace_pat.sub(' ', s) + return s.strip() + + class Source(Plugin): type = _('Metadata source') @@ -128,10 +143,91 @@ class Source(Plugin): gr.append(job) return [g for g in groups if g] + def test_fields(self, mi): + ''' + Return the first field from self.touched_fields that is null on the + mi object + ''' + for key in self.touched_fields: + if key.startswith('identifier:'): + key = key.partition(':')[-1] + if not mi.has_identifier(key): + return 'identifier: ' + key + elif mi.is_null(key): + return key + + # }}} # Metadata API {{{ + def get_cached_cover_url(self, identifiers): + ''' + Return cached cover URL for the book identified by + the identifiers dict or None if no such URL exists + ''' +
return None + + def compare_identify_results(self, x, y, title=None, authors=None, + identifiers={}): + ''' + Method used to sort the results from a call to identify by relevance. + Uses the actual query and various heuristics to rank results. + Re-implement in your plugin if this generic algorithm is not suitable. + Note that this method assumes x and y have a source_relevance + attribute. + + x < y iff x is more relevant than y + ''' + # First, guarantee that if the query specifies an ISBN, the result with + # the same isbn is the most relevant + def isbn_test(mi): + return mi.isbn and mi.isbn == identifiers.get('isbn', None) + + def boolcmp(a, b): + return -1 if a and not b else 1 if not a and b else 0 + + x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y) + result = boolcmp(x_has_isbn, y_has_isbn) + if result != 0: + return result + + # Now prefer results that have complete metadata over those that don't + x_has_all_fields = self.test_fields(x) is None + y_has_all_fields = self.test_fields(y) is None + + result = boolcmp(x_has_all_fields, y_has_all_fields) + if result != 0: + return result + + # Now prefer results whose title matches the search query + if title: + x_title = cleanup_title(x.title) + y_title = cleanup_title(y.title) + t = cleanup_title(title) + x_has_title, y_has_title = x_title == t, y_title == t + result = boolcmp(x_has_title, y_has_title) + if result != 0: + return result + + # Now prefer results with the longer comments, within 10% + cx = len(x.comments.strip() if x.comments else '') + cy = len(y.comments.strip() if y.comments else '') + t = (cx + cy) / 20 + result = cy - cx + if result != 0 and abs(cx - cy) > t: + return result + + # Now prefer results with cached cover URLs + x_has_cover = self.get_cached_cover_url(x.identifiers) is not None + y_has_cover = self.get_cached_cover_url(y.identifiers) is not None + result = boolcmp(x_has_cover, y_has_cover) + if result != 0: + return result + + # Now use the relevance reported by the
remote search engine + return x.source_relevance - y.source_relevance + def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=5): ''' @@ -147,6 +243,15 @@ class Source(Plugin): the same ISBN/special identifier does not need to get the cover URL again. Use the caching API for this. + Every Metadata object put into result_queue by this method must have a + `source_relevance` attribute that is an integer indicating the order in + which the results were returned by the metadata source for this query. + This integer will be used by :meth:`compare_identify_results`. If the + order is unimportant, set it to zero for every result. + + Make sure that any cover/isbn mapping information is cached before the + Metadata object is put into result_queue. + :param log: A log object, use it to output debugging information/errors :param result_queue: A result Queue, results should be put into it. Each result is a Metadata object diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index c44ad81b6c..b7298c0099 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -190,14 +190,15 @@ class GoogleBooks(Source): return raw and len(raw) > 17000 and raw[1:4] != 'PNG' def get_all_details(self, br, log, entries, abort, result_queue, timeout): - for i in entries: + for relevance, i in enumerate(entries): try: ans = to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): - result_queue.put(ans) + ans.source_relevance = relevance for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, ans.identifiers['google']) + result_queue.put(ans) except: log.exception( 'Failed to get metadata for identify entry:', diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index 2af9a47078..032041ef29 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ 
b/src/calibre/ebooks/metadata/sources/test.py @@ -46,15 +46,6 @@ def authors_test(authors): return test -def _test_fields(touched_fields, mi): - for key in touched_fields: - if key.startswith('identifier:'): - key = key.partition(':')[-1] - if not mi.has_identifier(key): - return 'identifier: ' + key - elif mi.is_null(key): - return key - def test_identify_plugin(name, tests): ''' @@ -120,11 +111,10 @@ def test_identify_plugin(name, tests): prints('Log saved to', lf) raise SystemExit(1) - good = [x for x in possibles if _test_fields(plugin.touched_fields, x) is + good = [x for x in possibles if plugin.test_fields(x) is None] if not good: - prints('Failed to find', _test_fields(plugin.touched_fields, - possibles[0])) + prints('Failed to find', plugin.test_fields(possibles[0])) raise SystemExit(1)