diff --git a/resources/images/document-encrypt.png b/resources/images/document-encrypt.png new file mode 100644 index 0000000000..0774342024 Binary files /dev/null and b/resources/images/document-encrypt.png differ diff --git a/resources/recipes/globe_and_mail.recipe b/resources/recipes/globe_and_mail.recipe index 4cc76688c1..22cb6fa5bb 100644 --- a/resources/recipes/globe_and_mail.recipe +++ b/resources/recipes/globe_and_mail.recipe @@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en' globeandmail.com ''' +import re + from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1287083651(BasicNewsRecipe): title = u'Globe & Mail' - __license__ = 'GPL v3' - __author__ = 'Szing' + __author__ = 'Kovid Goyal' oldest_article = 2 no_stylesheets = True max_articles_per_feed = 100 @@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe): (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss') ] - keep_only_tags = [ - dict(name='h1'), - dict(name='h2', attrs={'id':'articletitle'}), - dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}), - dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}), - dict(name='id', attrs={'class':'article'}), - dict(name='table', attrs={'class':'todays-market'}), - dict(name='header', attrs={'id':'leadheader'}) - ] + preprocess_regexps = [ + (re.compile(r'', re.DOTALL), lambda m: ''), + (re.compile(r'', re.DOTALL), lambda m: ''), + ] + remove_tags_before = dict(name='h1') remove_tags = [ - dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']}) - ] - - #this has to be here or the text in the article appears twice. - remove_tags_after = [dict(id='article')] + dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), + dict(href=lambda x: x and 'tracking=' in x), + {'class':['articleTools', 'pagination', 'Ads', 'topad', + 'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] #Use the mobile version rather than the web version def print_version(self, url): - return url + '&service=mobile' + return url.rpartition('?')[0] + '?service=mobile' diff --git a/resources/recipes/msnbc.recipe b/resources/recipes/msnbc.recipe index f093479e2f..6e58585341 100644 --- a/resources/recipes/msnbc.recipe +++ b/resources/recipes/msnbc.recipe @@ -4,7 +4,6 @@ __copyright__ = '2010-2011, Darko Miletic ' msnbc.msn.com ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class MsNBC(BasicNewsRecipe): @@ -19,7 +18,7 @@ class MsNBC(BasicNewsRecipe): publisher = 'msnbc.com' category = 'news, USA, world' language = 'en' - extra_css = """ + extra_css = """ body{ font-family: Georgia,Times,serif } .hide{display: none} .caption{font-family: Arial,sans-serif; font-size: x-small} @@ -44,7 +43,7 @@ class MsNBC(BasicNewsRecipe): ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']}) ] remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace'] - + remove_tags = [ dict(name=['iframe','object','link','embed','meta','table']) ,dict(name='span', attrs={'class':['copyright','Linear copyright']}) @@ -70,7 +69,7 @@ class MsNBC(BasicNewsRecipe): if item.has_key('id') and item['id'].startswith('vine-'): item.extract() if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')): - item.extract() + item.extract() for item in soup.body.findAll('img'): if not item.has_key('alt'): item['alt'] = 'image' @@ -83,6 +82,6 @@ class MsNBC(BasicNewsRecipe): for alink in soup.findAll('a'): if alink.string is not None: tstr = alink.string - alink.replaceWith(tstr) + alink.replaceWith(tstr) return soup diff --git a/resources/templates/rtf.xsl b/resources/templates/rtf.xsl index ea1fc71172..6db1c0388d 100644 --- a/resources/templates/rtf.xsl +++ b/resources/templates/rtf.xsl @@ -287,7 +287,7 @@ ] - + @@ -297,7 +297,7 @@ - + diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 2585b5d081..a4f7439405 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -459,6 +459,18 @@ def force_unicode(obj, enc=preferred_encoding): obj = obj.decode('utf-8') return obj +def as_unicode(obj, enc=preferred_encoding): + if not isbytestring(obj): + try: + obj = unicode(obj) + except: + try: + obj = str(obj) + except: + obj = repr(obj) + return force_unicode(obj, enc=enc) + + def human_readable(size): """ Convert a size in bytes into a human readable form """ diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index ad41125575..c20d880a2f 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors title_pat = re.compile(r'\{\\info.*?\{\\title(.*?)(? 6: - md += '}' + md.append(r'{\subject %s}'%(comment,)) + if options.publisher: + publisher = options.publisher.encode('ascii', 'ignore') + md.append(r'{\manager %s}'%(publisher,)) + if options.tags: + tags = u', '.join(options.tags) + tags = tags.encode('ascii', 'ignore') + md.append(r'{\category %s}'%(tags,)) + if len(md) > 1: + md.append('}') stream.seek(0) src = stream.read() - ans = src[:6] + md + src[6:] + ans = src[:6] + u''.join(md) + src[6:] stream.seek(0) stream.write(ans) @@ -156,7 +169,7 @@ def set_metadata(stream, options): base_pat = r'\{\\name(.*?)(? 0: - if raw[pos] == '}': bc -= 1 - elif raw[pos] == '{': bc += 1 - pos += 1 - pict = raw[start:pos+1] - enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + for enc in encs: if len(enc) % 2 == 1: enc = enc[:-1] data = enc.decode('hex') + fmt = imghdr.what(None, data) + if fmt is None: + fmt = 'wmf' count += 1 - name = (('%4d'%count).replace(' ', '0'))+'.wmf' - open(name, 'wb').write(data) + name = '%04d.%s' % (count, fmt) + with open(name, 'wb') as f: + f.write(data) imap[count] = name #open(name+'.hex', 'wb').write(enc) return self.convert_images(imap) def convert_images(self, imap): - for count, val in imap.items(): + self.default_img = None + for count, val in imap.iteritems(): try: imap[count] = self.convert_image(val) except: @@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin): return imap def convert_image(self, name): + if not name.endswith('.wmf'): + return name try: return self.rasterize_wmf(name) except: @@ -167,21 +179,22 @@ class RTFInput(InputFormatPlugin): def replace_wmf(self, name): from calibre.ebooks import calibre_cover - data = calibre_cover('Conversion of WMF images is not supported', + if self.default_img is None: + self.default_img = calibre_cover('Conversion of WMF images is not supported', 'Use Microsoft Word or OpenOffice to save this RTF file' ' as HTML and convert that in calibre.', title_size=36, author_size=20) name = name.replace('.wmf', '.jpg') with open(name, 'wb') as f: - f.write(data) + f.write(self.default_img) return name def rasterize_wmf(self, name): - from calibre.utils.wmf import extract_raster_image + from calibre.utils.wmf.parse import wmf_unwrap with open(name, 'rb') as f: data = f.read() - data = extract_raster_image(data) - name = name.replace('.wmf', '.jpg') + data = wmf_unwrap(data) + name = name.replace('.wmf', '.png') with open(name, 'wb') as f: f.write(data) return name @@ -212,27 +225,27 @@ class RTFInput(InputFormatPlugin): css += '\n'+'\n'.join(font_size_classes) css += '\n' +'\n'.join(color_classes) - for cls, val in border_styles.items(): + for cls, val in border_styles.iteritems(): css += '\n\n.%s {\n%s\n}'%(cls, val) with open('styles.css', 'ab') as f: f.write(css) - def preprocess(self, fname): - self.log('\tPreprocessing to convert unicode characters') - try: - data = open(fname, 'rb').read() - from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser - tokenizer = RtfTokenizer(data) - tokens = RtfTokenParser(tokenizer.tokens) - data = tokens.toRTF() - fname = 'preprocessed.rtf' - with open(fname, 'wb') as f: - f.write(data) - except: - self.log.exception( - 'Failed to preprocess RTF to convert unicode sequences, ignoring...') - return fname + # def preprocess(self, fname): + # self.log('\tPreprocessing to convert unicode characters') + # try: + # data = open(fname, 'rb').read() + # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser + # tokenizer = RtfTokenizer(data) + # tokens = RtfTokenParser(tokenizer.tokens) + # data = tokens.toRTF() + # fname = 'preprocessed.rtf' + # with open(fname, 'wb') as f: + # f.write(data) + # except: + # self.log.exception( + # 'Failed to preprocess RTF to convert unicode sequences, ignoring...') + # return fname def convert_borders(self, doc): border_styles = [] @@ -269,17 +282,14 @@ class RTFInput(InputFormatPlugin): self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file - fname = self.preprocess(stream.name) + # fname = self.preprocess(stream.name) try: - xml = self.generate_xml(fname) + xml = self.generate_xml(stream.name) except RtfInvalidCodeException, e: + raise raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.\n%s')%e) - '''dataxml = open('dataxml.xml', 'w') - dataxml.write(xml) - dataxml.close''' - d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {} diff --git a/src/calibre/ebooks/rtf2xml/ParseRtf.py b/src/calibre/ebooks/rtf2xml/ParseRtf.py index 7b89407f79..cdd9a3d088 100755 --- a/src/calibre/ebooks/rtf2xml/ParseRtf.py +++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py @@ -17,7 +17,8 @@ ######################################################################### # $Revision: 1.41 $ # $Date: 2006/03/24 23:50:07 $ -import sys,os +import sys, os + from calibre.ebooks.rtf2xml import headings_to_sections, \ line_endings, footnote, fields_small, default_encoding, \ make_lists, preamble_div, header, colors, group_borders, \ @@ -90,7 +91,6 @@ class ParseRtf: out_file = '', out_dir = None, dtd = '', - #debug = 0, #why? calibre deb_dir = None, convert_symbol = None, convert_wingdings = None, @@ -107,6 +107,7 @@ class ParseRtf: no_dtd = 0, char_data = '', ): + """ Requires: 'file' --file to parse @@ -119,12 +120,11 @@ class ParseRtf: script tries to output to directory where is script is exectued.) 'deb_dir' --debug directory. If a debug_dir is provided, the script will copy each run through as a file to examine in the debug_dir - 'perl_script'--use perl to make tokens. This runs just a bit faster. - (I will probably phase this out.) 'check_brackets' -- make sure the brackets match up after each run through a file. Only for debugging. Returns: Nothing """ + self.__file = in_file self.__out_file = out_file self.__out_dir = out_dir @@ -132,7 +132,7 @@ class ParseRtf: self.__dtd_path = dtd self.__check_file(in_file,"file_to_parse") self.__char_data = char_data - self.__debug_dir = deb_dir #self.__debug_dir = debug calibre + self.__debug_dir = deb_dir self.__check_dir(self.__temp_dir) self.__copy = self.__check_dir(self.__debug_dir) self.__convert_caps = convert_caps @@ -155,25 +155,24 @@ class ParseRtf: if hasattr(the_file, 'read'): return if the_file == None: if type == "file_to_parse": - message = "You must provide a file for the script to work" - msg = message + msg = "\nYou must provide a file for the script to work" raise RtfInvalidCodeException, msg elif os.path.exists(the_file): pass # do nothing else: - message = "The file '%s' cannot be found" % the_file - msg = message + msg = "\nThe file '%s' cannot be found" % the_file raise RtfInvalidCodeException, msg + def __check_dir(self, the_dir): """Check to see if directory exists""" if not the_dir : return dir_exists = os.path.isdir(the_dir) if not dir_exists: - message = "%s is not a directory" % the_dir - msg = message + msg = "\n%s is not a directory" % the_dir raise RtfInvalidCodeException, msg return 1 + def parse_rtf(self): """ Parse the file by calling on other classes. @@ -194,13 +193,14 @@ class ParseRtf: copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() copy_obj.copy_file(self.__temp_file, "original_file") - # new as of 2005-08-02. Do I want this? + # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets\ (file = self.__temp_file, bug_handler = RtfInvalidCodeException, ) - # convert Macintosh line endings to Unix line endings + #convert Macintosh and Windows line endings to Unix line endings + #why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file = self.__temp_file, bug_handler = RtfInvalidCodeException, @@ -208,13 +208,13 @@ class ParseRtf: run_level = self.__run_level, replace_illegals = self.__replace_illegals, ) - return_value = line_obj.fix_endings() + return_value = line_obj.fix_endings() #calibre return what? self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler = RtfInvalidCodeException, in_file = self.__temp_file, copy = self.__copy, - run_level = self.__run_level,) + run_level = self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file = self.__temp_file, @@ -230,12 +230,25 @@ class ParseRtf: os.remove(self.__temp_file) except OSError: pass + #Check to see if the file is correctly encoded + encode_obj = default_encoding.DefaultEncoding( + in_file = self.__temp_file, + run_level = self.__run_level, + bug_handler = RtfInvalidCodeException, + check_raw = True, + ) + platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( - bug_handler = RtfInvalidCodeException, - ) - check_encoding_obj.check_encoding(self.__file) - sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8')) - raise InvalidRtfException, msg + bug_handler = RtfInvalidCodeException, + ) + enc = encode_obj.get_codepage() + if enc != 'mac_roman': + enc = 'cp' + enc + if check_encoding_obj.check_encoding(self.__file, enc): + file_name = self.__file if isinstance(self.__file, str) \ + else self.__file.encode('utf-8') + msg = 'File %s does not appear to be correctly encoded.\n' % file_name + raise InvalidRtfException, msg delete_info_obj = delete_info.DeleteInfo( in_file = self.__temp_file, copy = self.__copy, @@ -508,6 +521,7 @@ class ParseRtf: indent = self.__indent, run_level = self.__run_level, no_dtd = self.__no_dtd, + encoding = encode_obj.get_codepage(), bug_handler = RtfInvalidCodeException, ) tags_obj.convert_to_tags() @@ -520,35 +534,28 @@ class ParseRtf: output_obj.output() os.remove(self.__temp_file) return self.__exit_level + def __bracket_match(self, file_name): if self.__run_level > 2: good_br, msg = self.__check_brack_obj.check_brackets() if good_br: pass - # sys.stderr.write( msg + ' in ' + file_name + "\n") + #sys.stderr.write( msg + ' in ' + file_name + "\n") else: - msg += msg + " in file '" + file_name + "'\n" + msg = '%s in file %s\n' % (msg, file_name) raise RtfInvalidCodeException, msg + def __return_code(self, num): - if num == None: - return - if int(num) > self.__exit_level: - self.__exit_level = num + if num == None: + return + if int(num) > self.__exit_level: + self.__exit_level = num + def __make_temp_file(self,file): """Make a temporary file to parse""" write_file="rtf_write_file" read_obj = file if hasattr(file, 'read') else open(file,'r') - write_obj = open(write_file, 'w') - line = "dummy" - while line: - line = read_obj.read(1000) - write_obj.write(line ) - write_obj.close() + with open(write_file, 'wb') as write_obj: + for line in read_obj: + write_obj.write(line) return write_file - """ -mi1\n -mi33\n -mi' % info) + def __empty_func(self, line): """ Print out empty tag and newlines when needed. @@ -85,10 +96,11 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __open_att_func(self, line): """ Process lines for open tags that have attributes. - The important infor is between [17:-1]. Take this info and split it + The important info is between [17:-1]. Take this info and split it with the delimeter '<'. The first token in this group is the element name. The rest are attributes, separated fromt their values by '>'. So read each token one at a time, and split them by '>'. @@ -119,6 +131,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __empty_att_func(self, line): """ Same as the __open_att_func, except a '/' is placed at the end of the tag. @@ -143,6 +156,7 @@ class ConvertToTags: self.__write_new_line() if element_name in self.__two_new_line: self.__write_extra_new_line() + def __close_func(self, line): """ Print out the closed tag and new lines, if appropriate. @@ -156,6 +170,7 @@ class ConvertToTags: self.__write_new_line() if info in self.__two_new_line: self.__write_extra_new_line() + def __text_func(self, line): """ Simply print out the information between [17:-1] @@ -163,6 +178,7 @@ class ConvertToTags: #tx') + #keep maximum compatibility with previous version + check_encoding_obj = check_encoding.CheckEncoding( + bug_handler=self.__bug_handler) + + if not check_encoding_obj.check_encoding(self.__file, verbose=False): + self.__write_obj.write('') + elif not check_encoding_obj.check_encoding(self.__file, self.__encoding): + self.__write_obj.write('' % self.__encoding) + else: + self.__write_obj.write('') + sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and' + ' hope for the best') self.__new_line = 0 self.__write_new_line() if self.__no_dtd: @@ -207,6 +237,7 @@ class ConvertToTags: ) self.__new_line = 0 self.__write_new_line() + def convert_to_tags(self): """ Read in the file one line at a time. Get the important info, between @@ -222,18 +253,14 @@ class ConvertToTags: an empty tag function. """ self.__initiate_values() - read_obj = open(self.__file, 'r') self.__write_obj = open(self.__write_to, 'w') self.__write_dec() - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - action = self.__state_dict.get(self.__token_info) - if action != None: - action(line) - read_obj.close() + with open(self.__file, 'r') as read_obj: + for line in read_obj: + self.__token_info = line[:16] + action = self.__state_dict.get(self.__token_info) + if action is not None: + action(line) self.__write_obj.close() copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: diff --git a/src/calibre/ebooks/rtf2xml/copy.py b/src/calibre/ebooks/rtf2xml/copy.py index ff029c1841..1b620b9fbf 100755 --- a/src/calibre/ebooks/rtf2xml/copy.py +++ b/src/calibre/ebooks/rtf2xml/copy.py @@ -23,6 +23,7 @@ class Copy: def __init__(self, bug_handler, file = None, deb_dir = None, ): self.__file = file self.__bug_handler = bug_handler + def set_dir(self, deb_dir): """Set the temporary directory to write files to""" if deb_dir is None: @@ -33,19 +34,11 @@ class Copy: message = "%(deb_dir)s is not a directory" % vars() raise self.__bug_handler , message Copy.__dir = deb_dir + def remove_files(self ): """Remove files from directory""" self.__remove_the_files(Copy.__dir) - """ - list_of_files = os.listdir(Copy.__dir) - list_of_files = os.listdir(the_dir) - for file in list_of_files: - rem_file = os.path.join(Copy.__dir,file) - if os.path.isdir(rem_file): - self.remove_files(rem_file) - else: - os.remove(rem_file) - """ + def __remove_the_files(self, the_dir): """Remove files from directory""" list_of_files = os.listdir(the_dir) @@ -58,6 +51,7 @@ class Copy: os.remove(rem_file) except OSError: pass + def copy_file(self, file, new_file): """ Copy the file to a new name diff --git a/src/calibre/ebooks/rtf2xml/default_encoding.py b/src/calibre/ebooks/rtf2xml/default_encoding.py index b932b465d0..53887e0d90 100755 --- a/src/calibre/ebooks/rtf2xml/default_encoding.py +++ b/src/calibre/ebooks/rtf2xml/default_encoding.py @@ -1,61 +1,142 @@ ######################################################################### # # -# # # copyright 2002 Paul Henry Tremblay # # # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # -# General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, write to the Free Software # -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # -# 02111-1307 USA # -# # -# # ######################################################################### + +''' +Codepages as to RTF 1.9.1: + 437 United States IBM + 708 Arabic (ASMO 708) + 709 Arabic (ASMO 449+, BCON V4) + 710 Arabic (transparent Arabic) + 711 Arabic (Nafitha Enhanced) + 720 Arabic (transparent ASMO) + 819 Windows 3.1 (United States and Western Europe) + 850 IBM multilingual + 852 Eastern European + 860 Portuguese + 862 Hebrew + 863 French Canadian + 864 Arabic + 865 Norwegian + 866 Soviet Union + 874 Thai + 932 Japanese + 936 Simplified Chinese + 949 Korean + 950 Traditional Chinese + 1250 Eastern European + 1251 Cyrillic + 1252 Western European + 1253 Greek + 1254 Turkish + 1255 Hebrew + 1256 Arabic + 1257 Baltic + 1258 Vietnamese + 1361 Johab + 10000 MAC Roman + 10001 MAC Japan + 10004 MAC Arabic + 10005 MAC Hebrew + 10006 MAC Greek + 10007 MAC Cyrillic + 10029 MAC Latin2 + 10081 MAC Turkish + 57002 Devanagari + 57003 Bengali + 57004 Tamil + 57005 Telugu + 57006 Assamese + 57007 Oriya + 57008 Kannada + 57009 Malayalam + 57010 Gujarati + 57011 Punjabi +''' +import re + class DefaultEncoding: """ Find the default encoding for the doc """ - def __init__(self, in_file, bug_handler, run_level = 1,): - """ - Required: - 'file' - Returns: - nothing - """ + def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False): self.__file = in_file self.__bug_handler = bug_handler + self.__platform = 'Windows' + self.__default_num = 'not-defined' + self.__code_page = '1252' + self.__datafetched = False + self.__fetchraw = check_raw + def find_default_encoding(self): - platform = 'Windows' - default_num = 'not-defined' - code_page = 'ansicpg1252' - read_obj = open(self.__file, 'r') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - self.__token_info = line[:16] - if self.__token_info == 'mi 3: msg = 'flag problem\n' raise self.__bug_handler, msg - return 1 + return True elif self.__token_info in self.__allowable : if self.__ob: self.__write_obj.write(self.__ob) @@ -132,85 +138,81 @@ class DeleteInfo: self.__state = 'default' else: pass - return 1 + return True elif self.__token_info == 'cw 5: - msg = 'After an asterisk, and found neither an allowable or non-allowble token\n' - msg += 'token is "%s"\n' % self.__token_info - raise self.__bug_handler + msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\ + token is "%s"\n') % self.__token_info + raise self.__bug_handler, msg if not self.__ob: - self.__write_cb = 1 + self.__write_cb = True self.__ob = 0 self.__state = 'delete' self.__cb_count = 0 - return 0 + return False + def __found_list_func(self, line): """ print out control words in this group """ self.__state = 'list' + def __list_func(self, line): """ Check to see if the group has ended. - Return 1 for all control words. - Return 0 otherwise. + Return True for all control words. + Return False otherwise. """ if self.__delete_count == self.__cb_count and self.__token_info ==\ 'cb%s\n' % self.__footnote_count) self.__first_line = 0 + def __in_footnote_func(self, line): """Handle all tokens that are part of footnote""" if self.__first_line: @@ -68,6 +72,7 @@ class Footnote: 'mi ci - 'annotation' : 'annotation', + 'annotation' : 'annotation', 'blue______' : 'blue', 'bold______' : 'bold', - 'caps______' : 'caps', - 'char-style' : 'character-style', - 'dbl-strike' : 'double-strike-through', + 'caps______' : 'caps', + 'char-style' : 'character-style', + 'dbl-strike' : 'double-strike-through', 'emboss____' : 'emboss', 'engrave___' : 'engrave', 'font-color' : 'font-color', @@ -96,7 +97,7 @@ class Inline: 'font-size_' : 'font-size', 'font-style' : 'font-style', 'font-up___' : 'superscript', - 'footnot-mk' : 'footnote-marker', + 'footnot-mk' : 'footnote-marker', 'green_____' : 'green', 'hidden____' : 'hidden', 'italics___' : 'italics', @@ -107,9 +108,10 @@ class Inline: 'strike-thr' : 'strike-through', 'subscript_' : 'subscript', 'superscrip' : 'superscript', - 'underlined' : 'underlined', + 'underlined' : 'underlined', } self.__caps_list = ['false'] + def __set_list_func(self, line): """ Requires: @@ -128,6 +130,7 @@ class Inline: self.__place = 'in_list' self.__inline_list = self.__list_inline_list self.__groups_in_waiting = self.__groups_in_waiting_list + def __default_func(self, line): """ Requires: @@ -140,8 +143,8 @@ class Inline: action = self.__default_dict.get(self.__token_info) if action: action(line) - if self.__token_info != 'cw%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __end_para_func(self, line): """ Requires: @@ -342,6 +346,7 @@ class Inline: self.__write_obj.write('mi%s' % (the_key, the_dict[the_key])) self.__write_obj.write('\n') self.__groups_in_waiting[0] = 0 + def __found_field_func(self, line): """ Just a default function to make sure I don't prematurely exit default state """ pass + def form_tags(self): """ Requires: @@ -386,32 +393,27 @@ class Inline: the state. """ self.__initiate_values() - read_obj = open(self.__file, 'r') - self.__write_obj = open(self.__write_to, 'w') - line_to_read = 1 - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - token = line[0:-1] - self.__token_info = '' - if token == 'tx 1: - sys.stderr.write('Removing files from old pict directory...\n') - all_files = os.listdir(self.__dir_name) - for the_file in all_files: - the_file = os.path.join(self.__dir_name, the_file) - try: - os.remove(the_file) - except OSError: - pass - if self.__run_level > 1: - sys.stderr.write('Files removed.\n') + if self.__run_level > 1: + sys.stderr.write('Removing files from old pict directory...\n') + all_files = os.listdir(self.__dir_name) + for the_file in all_files: + the_file = os.path.join(self.__dir_name, the_file) + try: + os.remove(the_file) + except OSError: + pass + if self.__run_level > 1: + sys.stderr.write('Files removed.\n') def __create_pict_file(self): """Create a file for all the pict data to be written to. """ self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf') - write_pic_obj = open(self.__pict_file, 'w') - write_pic_obj.close() self.__write_pic_obj = open(self.__pict_file, 'a') def __in_pict_func(self, line): if self.__cb_count == self.__pict_br_count: - self.__in_pict = 0 + self.__in_pict = False self.__write_pic_obj.write("}\n") - return 1 + return True else: action = self.__pict_dict.get(self.__token_info) if action: - line = action(line) - self.__write_pic_obj.write(line) - return 0 + self.__write_pic_obj.write(action(line)) + return False def __default(self, line, write_obj): """Determine if each token marks the beginning of pict data. @@ -142,53 +128,50 @@ class Pict: write_obj.write('mi ml '*' : ('ml', 'asterisk__', self.default_func), ':' : ('ml', 'colon_____', self.default_func), @@ -73,7 +78,6 @@ class ProcessTokens: 'backslash' : ('nu', '\\', self.text_func), 'ob' : ('nu', '{', self.text_func), 'cb' : ('nu', '}', self.text_func), - 'line' : ('nu', 'hard-lineb', self.default_func), #calibre #'line' : ('nu', ' ', self.text_func), calibre # paragraph formatting => pf 'page' : ('pf', 'page-break', self.default_func), @@ -159,15 +163,17 @@ class ProcessTokens: 'rtf' : ('ri', 'rtf_______', self.default_func), 'deff' : ('ri', 'deflt-font', self.default_func), 'mac' : ('ri', 'macintosh_', self.default_func), + 'pc' : ('ri', 'pc________', self.default_func), + 'pca' : ('ri', 'pca_______', self.default_func), 'ansi' : ('ri', 'ansi______', self.default_func), 'ansicpg' : ('ri', 'ansi-codpg', self.default_func), # notes => nt 'footnote' : ('nt', 'footnote__', self.default_func), 'ftnalt' : ('nt', 'type______ an - 'tc' : ('an', 'toc_______', self.default_func), + 'tc' : ('an', 'toc_______', self.default_func), 'bkmkstt' : ('an', 'book-mk-st', self.default_func), - 'bkmkstart' : ('an', 'book-mk-st', self.default_func), + 'bkmkstart' : ('an', 'book-mk-st', self.default_func), 'bkmkend' : ('an', 'book-mk-en', self.default_func), 'xe' : ('an', 'index-mark', self.default_func), 'rxe' : ('an', 'place_____', self.default_func), @@ -347,7 +353,7 @@ class ProcessTokens: 10: 'Kanji numbering without the digit character', 11: 'Kanji numbering with the digit character', 1246: 'phonetic Katakana characters in aiueo order', - 1346: 'phonetic katakana characters in iroha order', + 1346: 'phonetic katakana characters in iroha order', 14: 'double byte character', 15: 'single byte character', 16: 'Kanji numbering 3', @@ -392,7 +398,7 @@ class ProcessTokens: 5121 : 'Arabic Algeria', 15361 : 'Arabic Bahrain', 3073 : 'Arabic Egypt', - 1 : 'Arabic General', + 1 : 'Arabic General', 2049 : 'Arabic Iraq', 11265 : 'Arabic Jordan', 13313 : 'Arabic Kuwait', @@ -417,7 +423,7 @@ class ProcessTokens: 1059 : 'Byelorussian', 1027 : 'Catalan', 2052 : 'Chinese China', - 4 : 'Chinese General', + 4 : 'Chinese General', 3076 : 'Chinese Hong Kong', 4100 : 'Chinese Singapore', 1028 : 'Chinese Taiwan', @@ -431,7 +437,7 @@ class ProcessTokens: 2057 : 'English British', 4105 : 'English Canada', 9225 : 'English Caribbean', - 9 : 'English General', + 9 : 'English General', 6153 : 'English Ireland', 8201 : 'English Jamaica', 5129 : 'English New Zealand', @@ -595,30 +601,37 @@ class ProcessTokens: num = num[1:] # chop off leading 0, which I added num = num.upper() # the mappings store hex in caps return 'tx 3: - msg = 'number "%s" cannot be converted to integer\n' % num + msg = 'Number "%s" cannot be converted to integer\n' % num raise self.__bug_handler, msg type = self.__number_type_dict.get(num) - if type == None: + if type is None: if self.__run_level > 3: msg = 'No type for "%s" in self.__number_type_dict\n' raise self.__bug_handler type = 'Arabic' return 'cw<%s<%snum<%s\n' % (token, num) + def divide_by_2(self, pre, token, num): num = self.divide_num(num, 2) return 'cw<%s<%s%s<%s\n' % (token, num, token) + def divide_by_20(self, pre, token, num): num = self.divide_num(num, 20) return 'cw<%s<%s%s<%s\n' % (token, num, token) + def text_func(self, pre, token, num=None): return 'tx%s<%s\n' % (third_field, token, num, token) + def bool_st_func(self, pre, token, num): if num is None or num == '' or num == '1': return 'cw<%s<%sfalse<%s\n' % (token, token) else: - msg = 'boolean should have some value module process tokens\n' - msg += 'token is ' + token + "\n" - msg += "'" + num + "'" + "\n" + msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num) raise self.__bug_handler, msg + def __no_sup_sub_func(self, pre, token, num): the_string = 'cw 3: - msg = 'no number to process?\n' - msg += 'this indicates that the token ' - msg += ' \(\\li\) should have a number and does not\n' - msg += 'numerator is "%s"\n' % numerator - msg += 'denominator is "%s"\n' % denominator + msg = ('No number to process?\nthis indicates that the token \(\\li\) \ + should have a number and does not\nnumerator is \ + "%s"\ndenominator is "%s"\n') % (numerator, denominator) raise self.__bug_handler, msg if 5 > self.__return_code: self.__return_code = 5 @@ -698,9 +716,10 @@ class ProcessTokens: if string_num[-2:] == ".0": string_num = string_num[:-2] return string_num + def split_let_num(self, token): match_obj = re.search(self.__num_exp,token) - if match_obj != None: + if match_obj is not None: first = match_obj.group(1) second = match_obj.group(2) if not second: @@ -714,6 +733,7 @@ class ProcessTokens: raise self.__bug_handler return token, 0 return first, second + def convert_to_hex(self,number): """Convert a string to uppercase hexidecimal""" num = int(number) @@ -722,6 +742,7 @@ class ProcessTokens: return hex_num except: raise self.__bug_handler + def process_cw(self, token): """Change the value of the control word by determining what dictionary it belongs to""" @@ -737,89 +758,62 @@ class ProcessTokens: pre, token, action = self.dict_token.get(token, (None, None, None)) if action: return action(pre, token, num) - # unused function - def initiate_token_actions(self): - self.action_for_token={ - '{' : self.ob_func, - '}' : self.cb_func, - '\\' : self.process_cw, - } - # unused function - def evaluate_token(self,token): - """Evaluate tokens. Return a value if the token is not a - control word. Otherwise, pass token onto another method - for further evaluation.""" - token, action = self.dict_token.get(token[0:1]) - if action: - line = action(token) - return line - else : - return 'tx -1: - msg ='Invalid RTF: token "\\ " not valid. \n' - raise self.__exception_handler, msg - elif token[0:1] == "\\": - line = self.process_cw(token) - if line != None: - write_obj.write(line) - else: - fields = re.split(self.__utf_exp, token) - for field in fields: - if not field: - continue - if field[0:1] == '&': - write_obj.write('tx -1: + msg = 'Invalid RTF: token "\\ " not valid.\n' + raise self.__exception_handler, msg + elif token[:1] == "\\": + try: + token.decode('us-ascii') + except UnicodeError, msg: + msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg) + raise self.__exception_handler, msg + line = self.process_cw(token) + if line is not None: + write_obj.write(line) else: - write_obj.write('tx", ">") - line = line.replace("\\~", "\\~ ") - line = line.replace("\\_", "\\_ ") - line = line.replace("\\:", "\\: ") - line = line.replace("\\-", "\\- ") - # turn into a generic token to eliminate special - # cases and make processing easier - line = line.replace("\\{", "\\ob ") - # turn into a generic token to eliminate special - # cases and make processing easier - line = line.replace("\\}", "\\cb ") - # put a backslash in front of to eliminate special cases and - # make processing easier - line = line.replace("{", "\\{") - # put a backslash in front of to eliminate special cases and - # make processing easier - line = line.replace("}", "\\}") - line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line) - # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) - line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line) - ##line = line.replace("\\backslash", "\\\\") - # this is for older RTF - line = re.sub(self.__par_exp, '\\par ', line) - return line - def __compile_expressions(self): - self.__ms_hex_exp = re.compile(r"\\\'(..)") - self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}") - self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)") - self.__par_exp = re.compile(r'\\$') - self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") - ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") - def __create_tokens(self): self.__compile_expressions() - read_obj = open(self.__file, 'r') - write_obj = open(self.__write_to, 'w') - line_to_read = "dummy" - while line_to_read: - line_to_read = read_obj.readline() - line = line_to_read - line = line.replace("\n", "") - line = self.__sub_line_reg(line) - tokens = re.split(self.__splitexp, line) - ##print tokens - for token in tokens: - if token != "": - write_obj.write(token + "\n") - """ - match_obj = re.search(self.__mixed_exp, token) - if match_obj != None: - first = match_obj.group(1) - second = match_obj.group(2) - write_obj.write(first + "\n") - write_obj.write(second + "\n") - else: - write_obj.write(token + "\n") - """ - read_obj.close() - write_obj.close() + #variables + self.__uc_char = 0 + self.__uc_bin = False + self.__uc_value = [1] + + def __reini_utf8_counters(self): + self.__uc_char = 0 + self.__uc_bin = False + + def __remove_uc_chars(self, startchar, token): + for i in xrange(startchar, len(token)): + if token[i] == " ": + continue + elif self.__uc_char: + self.__uc_char -= 1 + else: + return token[i:] + #if only " " and char to skip + return '' + + def __unicode_process(self, token): + #change scope in + if token == '\{': + self.__uc_value.append(self.__uc_value[-1]) + #basic error handling + self.__reini_utf8_counters() + return token + #change scope out + elif token == '\}': + self.__uc_value.pop() + self.__reini_utf8_counters() + return token + #add a uc control + elif token[:3] == '\uc': + self.__uc_value[-1] = int(token[3:]) + self.__reini_utf8_counters() + return token + #bin data to slip + elif self.__uc_bin: + self.__uc_bin = False + return '' + #uc char to remove + elif self.__uc_char: + #handle \bin tag in case of uc char to skip + if token[:4] == '\bin': + self.__uc_char -=1 + self.__uc_bin = True + return '' + elif token[:1] == "\\" : + self.__uc_char -=1 + return '' + else: + return self.__remove_uc_chars(0, token) + #go for real \u token + match_obj = self.__utf_exp.match(token) + if match_obj is not None: + self.__reini_utf8_counters() + #get value and handle negative case + uni_char = int(match_obj.group(1)) + uni_len = len(match_obj.group(1)) + 2 + if uni_char < 0: + uni_char += 65536 + uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace') + self.__uc_char = self.__uc_value[-1] + #there is only an unicode char + if len(token)<= uni_len: + return uni_char + #an unicode char and something else + #must be after as it is splited on \ + #necessary? maybe for \bin? + elif not self.__uc_char: + return uni_char + token[uni_len:] + #if not uc0 and chars + else: + return uni_char + self.__remove_uc_chars(uni_len, token) + #default + return token + + def __sub_reg_split(self,input_file): + input_file = self.__replace_spchar.mreplace(input_file) + input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) + input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) + #remove \n in bin data + input_file = self.__bin_exp.sub(lambda x: \ + x.group().replace('\n', '') + '\n', input_file) + #split + tokens = re.split(self.__splitexp, input_file) + #remove empty tokens and \n + return filter(lambda x: len(x) > 0 and x != '\n', tokens) + #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file) + # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line) + # this is for older RTF + #line = re.sub(self.__par_exp, '\\par ', line) + #return filter(lambda x: len(x) > 0, \ + #(self.__remove_line.sub('', x) for x in tokens)) + + def __compile_expressions(self): + SIMPLE_RPL = { + "\\\\": "\\backslash ", + "\\~": "\\~ ", + "\\;": "\\; ", + "&": "&", + "<": "<", + ">": ">", + "\\~": "\\~ ", + "\\_": "\\_ ", + "\\:": "\\: ", + "\\-": "\\- ", + # turn into a generic token to eliminate special + # cases and make processing easier + "\\{": "\\ob ", + # turn into a generic token to eliminate special + # cases and make processing easier + "\\}": "\\cb ", + # put a backslash in front of to eliminate special cases and + # make processing easier + "{": "\\{", + # put a backslash in front of to eliminate special cases and + # make processing easier + "}": "\\}", + # this is for older RTF + r'\\$': '\\par ', + } + self.__replace_spchar = MReplace(SIMPLE_RPL) + #add ;? in case of char following \u + self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)" + self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?") + self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+") + #manage upr/ud situations + self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \ + r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}") + #add \n in split for whole file reading + #why keep backslash whereas \is replaced before? + #remove \n from endline char + self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") + #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}") + #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})") + #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)") + #self.__par_exp = re.compile(r'\\$') + #self.__remove_line = re.compile(r'\n+') + #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)") + ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)") + def tokenize(self): - """Main class for handling other methods. Reads in one line \ - at a time, usues method self.sub_line to make basic substitutions,\ - uses ? to process tokens""" - self.__create_tokens() + """Main class for handling other methods. Reads the file \ + , uses method self.sub_reg to make basic substitutions,\ + and process tokens by itself""" + #read + with open(self.__file, 'r') as read_obj: + input_file = read_obj.read() + + #process simple replacements and split giving us a correct list + #remove '' and \n in the process + tokens = self.__sub_reg_split(input_file) + #correct unicode + tokens = map(self.__unicode_process, tokens) + #remove empty items created by removing \uc + tokens = filter(lambda x: len(x) > 0, tokens) + + #write + with open(self.__write_to, 'wb') as write_obj: + write_obj.write('\n'.join(tokens)) + #Move and copy copy_obj = copy.Copy(bug_handler = self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "tokenize.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) + + #self.__special_tokens = [ '_', '~', "'", '{', '}' ] \ No newline at end of file diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 3702bbfabe..e1979063c0 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -1,4 +1,8 @@ # -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + ''' Read content from txt file. @@ -10,10 +14,7 @@ from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor from calibre.ebooks.conversion.preprocess import DocAnalysis - -__license__ = 'GPL v3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' +from calibre.utils.cleantext import clean_ascii_chars HTML_TEMPLATE = u'%s\n%s\n' @@ -33,9 +34,7 @@ def clean_txt(txt): # Remove excessive line breaks. txt = re.sub('\n{3,}', '\n\n', txt) #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 - chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) - illegal_chars = re.compile(u'|'.join(map(unichr, chars))) - txt = illegal_chars.sub('', txt) + txt = clean_ascii_chars(txt) return txt diff --git a/src/calibre/gui2/catalog/catalog_bibtex.py b/src/calibre/gui2/catalog/catalog_bibtex.py index 5030cf6ec8..7b7739bb46 100644 --- a/src/calibre/gui2/catalog/catalog_bibtex.py +++ b/src/calibre/gui2/catalog/catalog_bibtex.py @@ -27,14 +27,17 @@ class PluginWidget(QWidget, Ui_Form): def __init__(self, parent=None): QWidget.__init__(self, parent) self.setupUi(self) - from calibre.library.catalog import FIELDS - self.all_fields = [] - for x in FIELDS : - if x != 'all': - self.all_fields.append(x) - QListWidgetItem(x, self.db_fields) def initialize(self, name, db): #not working properly to update + from calibre.library.catalog import FIELDS + + self.all_fields = [x for x in FIELDS if x != 'all'] + #add custom columns + self.all_fields.extend([x for x in sorted(db.custom_field_keys())]) + #populate + for x in self.all_fields: + QListWidgetItem(x, self.db_fields) + self.name = name fields = gprefs.get(name+'_db_fields', self.all_fields) # Restore the activated db_fields from last use diff --git a/src/calibre/gui2/dialogs/drm_error.py b/src/calibre/gui2/dialogs/drm_error.py new file mode 100644 index 0000000000..5fbba47165 --- /dev/null +++ b/src/calibre/gui2/dialogs/drm_error.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from PyQt4.Qt import QDialog +from calibre.gui2.dialogs.drm_error_ui import Ui_Dialog + +class DRMErrorMessage(QDialog, Ui_Dialog): + + def __init__(self, parent=None, title=None): + QDialog.__init__(self, parent) + self.setupUi(self) + if title is not None: + t = unicode(self.msg.text()) + self.msg.setText('

%s

%s'%(title, t)) + self.resize(self.sizeHint()) + diff --git a/src/calibre/gui2/dialogs/drm_error.ui b/src/calibre/gui2/dialogs/drm_error.ui new file mode 100644 index 0000000000..842807c9bc --- /dev/null +++ b/src/calibre/gui2/dialogs/drm_error.ui @@ -0,0 +1,102 @@ + + + Dialog + + + + 0 + 0 + 417 + 235 + + + + This book is DRMed + + + + + + + 0 + 0 + + + + + 132 + 16777215 + + + + + + + :/images/document-encrypt.png + + + + + + + <p>This book is locked by <b>DRM</b>. To learn more about DRM and why you cannot read or convert this book in calibre, +<a href="http://bugs.calibre-ebook.com/wiki/DRM">click here</a>. + + + true + + + true + + + + + + + Qt::Horizontal + + + QDialogButtonBox::Close + + + + + + + + + + + buttonBox + accepted() + Dialog + accept() + + + 248 + 254 + + + 157 + 274 + + + + + buttonBox + rejected() + Dialog + reject() + + + 316 + 260 + + + 286 + 274 + + + + + diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index e1ee4327f3..5ea8f00148 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -15,7 +15,7 @@ from calibre.ebooks.metadata import string_to_authors, authors_to_string from calibre.ebooks.metadata.book.base import composite_formatter from calibre.ebooks.metadata.meta import get_metadata from calibre.gui2.custom_column_widgets import populate_metadata_page -from calibre.gui2 import error_dialog +from calibre.gui2 import error_dialog, ResizableDialog from calibre.gui2.progress_indicator import ProgressIndicator from calibre.utils.config import dynamic from calibre.utils.titlecase import titlecase @@ -49,7 +49,7 @@ def get_cover_data(path): -class MyBlockingBusy(QDialog): +class MyBlockingBusy(QDialog): # {{{ do_one_signal = pyqtSignal() @@ -241,8 +241,9 @@ class MyBlockingBusy(QDialog): self.current_index += 1 self.do_one_signal.emit() + # }}} -class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): +class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog): s_r_functions = { '' : lambda x: x, _('Lower Case') : lambda x: icu_lower(x), @@ -261,9 +262,8 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): ] def __init__(self, window, rows, model, tab): - QDialog.__init__(self, window) + ResizableDialog.__init__(self, window) Ui_MetadataBulkDialog.__init__(self) - self.setupUi(self) self.model = model self.db = model.db self.ids = [self.db.id(r) for r in rows] diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui index 41858b099b..9240cd1af8 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.ui +++ b/src/calibre/gui2/dialogs/metadata_bulk.ui @@ -6,8 +6,8 @@ 0 0 - 752 - 633 + 850 + 650 @@ -17,8 +17,8 @@ :/images/edit_input.png:/images/edit_input.png - - + + @@ -28,818 +28,836 @@ - - - - 6 + + + + QFrame::NoFrame - + 0 - - - + + true + + + + + 0 + 0 + 842 + 589 + + + + 0 - - - &Basic metadata - - - - - - &Author(s): - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - authors - - - - - - - A&utomatically set author sort - - - - - - - Author s&ort: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - author_sort - - - - - - - Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. - - - - - - - &Rating: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - rating - - - - - - - Rating of this book. 0-5 stars - - - Rating of this book. 0-5 stars - - - QAbstractSpinBox::PlusMinus - - - No change - - - stars - - - -1 - - - 5 - - - -1 - - - - - - - &Publisher: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - publisher - - - - - - - true - - - - - - - Add ta&gs: - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - tags - - - - - - - Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. - - - - - - - Open Tag Editor - - - Open Tag Editor - - - - :/images/chapters.png:/images/chapters.png - - - - - - - &Remove tags: - - - remove_tags - - - - - - - Comma separated list of tags to remove from the books. - - - - - - - Check this box to remove all tags from the books. - - - Remove all - - - - - - - &Series: - - - Qt::PlainText - - - Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter - - - series - - - - - - - + + + + 0 + + + + &Basic metadata + + + + + + &Author(s): + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + authors + + + + + + + A&utomatically set author sort + + + + + + + Author s&ort: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + author_sort + + + + + - List of known series. You can add new series. + Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles. + + + + + + + &Rating: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + rating + + + + + + + Rating of this book. 0-5 stars - List of known series. You can add new series. + Rating of this book. 0-5 stars + + QAbstractSpinBox::PlusMinus + + + No change + + + stars + + + -1 + + + 5 + + + -1 + + + + + + + &Publisher: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + publisher + + + + + true - - QComboBox::InsertAlphabetically + + + + + + Add ta&gs: - - QComboBox::AdjustToContents + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + tags - - + + - If checked, the series will be cleared + Tags categorize the book. This is particularly useful while searching. <br><br>They can be any words or phrases, separated by commas. + + + + + + + Open Tag Editor - Clear series + Open Tag Editor + + + + :/images/chapters.png:/images/chapters.png - - - - Qt::Horizontal + + + + &Remove tags: - - - 20 - 0 - + + remove_tags - + - - - - - - + + - If not checked, the series number for the books will be set to 1. + Comma separated list of tags to remove from the books. + + + + + + + Check this box to remove all tags from the books. + + + Remove all + + + + + + + &Series: + + + Qt::PlainText + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + series + + + + + + + + + List of known series. You can add new series. + + + List of known series. You can add new series. + + + true + + + QComboBox::InsertAlphabetically + + + QComboBox::AdjustToContents + + + + + + + If checked, the series will be cleared + + + Clear series + + + + + + + Qt::Horizontal + + + + 20 + 0 + + + + + + + + + + + + If not checked, the series number for the books will be set to 1. If checked, selected books will be automatically numbered, in the order you selected them. So if you selected Book A and then Book B, Book A will have series number 1 and Book B series number 2. - - - Automatically number books in this series - - - - - - - false - - - Series will normally be renumbered from the highest number in the database + + + Automatically number books in this series + + + + + + + false + + + Series will normally be renumbered from the highest number in the database for that series. Checking this box will tell calibre to start numbering from the value in the box + + + Force numbers to start with + + + + + + + false + + + 1 + + + 990000 + + + 1 + + + + + + + Qt::Horizontal + + + + 20 + 10 + + + + + + + + + + Remove &format: + + + remove_format + + + + + + + + + + true + + + + + + + &Swap title and author + + + + + + + Force the title to be in title case. If both this and swap authors are checked, +title and author are swapped before the title case is set - Force numbers to start with + Change title to title case - - - - false - - - 1 - - - 990000 - - - 1 - - - - - - - Qt::Horizontal - - - - 20 - 10 - - - - - - - - - - Remove &format: - - - remove_format - - - - - - - - - - true - - - - - - - &Swap title and author - - - - - - - Force the title to be in title case. If both this and swap authors are checked, -title and author are swapped before the title case is set - - - Change title to title case - - - - - - - Remove stored conversion settings for the selected books. + + + + Remove stored conversion settings for the selected books. Future conversion of these books will use the default settings. - - - Remove &stored conversion settings for the selected books - - - - - - - Qt::Vertical - - - - 20 - 40 - - - - - - - - Change &cover - - - - - - &Generate default cover - - - - - - - &Remove cover - - - - - - - Set from &ebook file(s) - - - - - - - - - - - &Custom metadata - - - - - &Search and replace - - - - QLayout::SetMinimumSize - - - - - true - - - true - - - - - - - - - - - - - - Search &field: - - - search_field - - - - - - - The name of the field that you want to search - - - - - - - + - Search &mode: - - - search_mode + Remove &stored conversion settings for the selected books - - - - Choose whether to use basic text matching or advanced regular expression matching - - - - - + + - Qt::Horizontal + Qt::Vertical 20 - 10 + 40 - - - - - - Te&mplate: - - - s_r_template - - - - - - - - 100 - 0 - - - - Enter a template to be used as the source for the search/replace - - - - - - - &Search for: - - - search_for - - - - - - - - 100 - 0 - - - - Enter the what you are looking for, either plain text or a regular expression, depending on the mode - - - - - - - Check this box if the search string must match exactly upper and lower case. Uncheck it if case is to be ignored - - - Cas&e sensitive - - - true - - - - - - - &Replace with: - - - replace_with - - - - - - - The replacement text. The matched search text will be replaced with this string - - - - - - - - - &Apply function after replace: - - - replace_func + + + + Change &cover + + + + + &Generate default cover + + + + + + + &Remove cover + + + + + + + Set from &ebook file(s) + + + + - - - - Specify how the text is to be processed after matching and replacement. In character mode, the entire -field is processed. In regular expression mode, only the matched text is processed - - - - - - - Qt::Horizontal - - - - 20 - 10 - - - - - - - - - &Destination field: + + + + &Custom metadata + + + + + &Search and replace + + + + QLayout::SetMinimumSize - - destination_field - - - - - - - The field that the text will be put into after all replacements. -If blank, the source field is used if the field is modifiable - - - - - - - + + + + true + + + true + + + + + - M&ode: + + + + + + + + Search &field: - replace_mode + search_field - - + + - Specify how the text should be copied into the destination. + The name of the field that you want to search - - + + + + + + Search &mode: + + + search_mode + + + + + + + Choose whether to use basic text matching or advanced regular expression matching + + + + + + + Qt::Horizontal + + + + 20 + 10 + + + + + + + + + + Te&mplate: + + + s_r_template + + + + + + + + 100 + 0 + + - Specifies whether result items should be split into multiple values or -left as single values. This option has the most effect when the source field is -not multiple and the destination field is multiple + Enter a template to be used as the source for the search/replace + + + + + + + &Search for: + + + search_for + + + + + + + + 100 + 0 + + + + Enter the what you are looking for, either plain text or a regular expression, depending on the mode + + + + + + + Check this box if the search string must match exactly upper and lower case. Uncheck it if case is to be ignored - Split &result + Cas&e sensitive true - - - - Qt::Horizontal - - - - 20 - 10 - - - - - - - - - - - - Qt::Horizontal - - - - 20 - 0 - - - - - - + + - For multiple-valued fields, sho&w + &Replace with: - results_count + replace_with - - - - true - - - 1 - - - 999 - - - 999 - - - - - - - values starting a&t - - - starting_from - - - - - - - true - - - 1 - - - 999 - - - 1 - - - - - - - with values separated b&y - - - multiple_separator - - - - - + + - Used when displaying test results to separate values in multiple-valued fields + The replacement text. The matched search text will be replaced with this string - - - - - - QFrame::NoFrame - - - true - - - - - 0 - 0 - 726 - 334 - - - - - + + + + - Test text + &Apply function after replace: + + + replace_func - - - - Test result + + + + Specify how the text is to be processed after matching and replacement. In character mode, the entire +field is processed. In regular expression mode, only the matched text is processed - - - - Your test: - - - - - - - - - - - + + - Qt::Vertical + Qt::Horizontal 20 - 5 + 10 - - - - - - - - + + + + + &Destination field: + + + destination_field + + + + + + + The field that the text will be put into after all replacements. +If blank, the source field is used if the field is modifiable + + + + + + + + + M&ode: + + + replace_mode + + + + + + + Specify how the text should be copied into the destination. + + + + + + + Specifies whether result items should be split into multiple values or +left as single values. This option has the most effect when the source field is +not multiple and the destination field is multiple + + + Split &result + + + true + + + + + + + Qt::Horizontal + + + + 20 + 10 + + + + + + + + + + + + Qt::Horizontal + + + + 20 + 0 + + + + + + + + For multiple-valued fields, sho&w + + + results_count + + + + + + + true + + + 1 + + + 999 + + + 999 + + + + + + + values starting a&t + + + starting_from + + + + + + + true + + + 1 + + + 999 + + + 1 + + + + + + + with values separated b&y + + + multiple_separator + + + + + + + Used when displaying test results to separate values in multiple-valued fields + + + + + + + + + QFrame::NoFrame + + + true + + + + + 0 + 0 + 197 + 60 + + + + + + + Test text + + + + + + + Test result + + + + + + + Your test: + + + + + + + + + + + + + Qt::Vertical + + + + 20 + 5 + + + + + + + + + + + + + + + - + Qt::Horizontal @@ -893,7 +911,6 @@ not multiple and the destination field is multiple swap_title_and_author change_title_to_title_case button_box - central_widget search_field search_mode s_r_template diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index c2588f57a8..a4e8bb6972 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -823,7 +823,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if book.series_index is not None: self.series_index.setValue(book.series_index) if book.has_cover: - if d.opt_auto_download_cover.isChecked() and book.has_cover: + if d.opt_auto_download_cover.isChecked(): self.fetch_cover() else: self.fetch_cover_button.setFocus(Qt.OtherFocusReason) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index 670a2d823e..6fa23c2813 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -384,8 +384,9 @@ class BooksModel(QAbstractTableModel): # {{{ name, val = mi.format_field(key) if mi.metadata_for_field(key)['datatype'] == 'comments': name += ':html' - if val: + if val and name not in data: data[name] = val + return data diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index a6eeabd57f..01d3180778 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -468,12 +468,8 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{ try: if 'calibre.ebooks.DRMError' in job.details: if not minz: - d = error_dialog(self, _('Conversion Error'), - _('

Could not convert: %s

It is a ' - 'DRMed book. You must first remove the ' - 'DRM using third party tools.')%\ - (job.description.split(':')[-1], - 'http://bugs.calibre-ebook.com/wiki/DRM')) + from calibre.gui2.dialogs.drm_error import DRMErrorMessage + d = DRMErrorMessage(self, job.description.split(':')[-1]) d.setModal(False) d.show() self._modeless_dialogs.append(d) diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 25f69b1558..c5001659a0 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -26,6 +26,7 @@ from calibre.gui2.search_box import SearchBox2 from calibre.ebooks.metadata import MetaInformation from calibre.customize.ui import available_input_formats from calibre.gui2.viewer.dictionary import Lookup +from calibre import as_unicode class TOCItem(QStandardItem): @@ -626,13 +627,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer): QApplication.processEvents() if worker.exception is not None: if isinstance(worker.exception, DRMError): - error_dialog(self, _('DRM Error'), - _('

This book is protected by DRM') - %'http://wiki.mobileread.com/wiki/DRM').exec_() + from calibre.gui2.dialogs.drm_error import DRMErrorMessage + DRMErrorMessage(self).exec_() else: r = getattr(worker.exception, 'reason', worker.exception) error_dialog(self, _('Could not open ebook'), - unicode(r), det_msg=worker.traceback, show=True) + as_unicode(r), det_msg=worker.traceback, show=True) self.close_progress_indicator() else: self.metadata.show_opf(self.iterator.opf, os.path.splitext(pathtoebook)[1][1:]) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 98cc4b7ecd..5cda9baa8c 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -1531,10 +1531,23 @@ class EPUB_MOBI(CatalogPlugin): self.opts.header_note_source_field, index_is_id=True) if notes: - if field_md['datatype'] == 'text' and isinstance(notes,list): - notes = ' · '.join(notes) + if field_md['datatype'] == 'text': + if isinstance(notes,list): + notes = ' · '.join(notes) elif field_md['datatype'] == 'datetime': notes = format_date(notes,'dd MMM yyyy') + elif field_md['datatype'] == 'composite': + m = re.match(r'\[(.+)\]$', notes) + if m is not None: + # Sniff for special pseudo-list string "[]" + bracketed_content = m.group(1) + if ',' in bracketed_content: + # Recast the comma-separated items as a list + items = bracketed_content.split(',') + items = [i.strip() for i in items] + notes = ' · '.join(items) + else: + notes = bracketed_content this_title['notes'] = {'source':field_md['name'], 'content':notes} diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 8a72ec040c..c2381938fb 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -709,6 +709,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): formats = row[fm['formats']] if not formats: formats = None + else: + formats = formats.split(',') mi.formats = formats tags = row[fm['tags']] if tags: diff --git a/src/calibre/trac/bzr_commit_plugin.py b/src/calibre/trac/bzr_commit_plugin.py index df6bf699d1..6c36115cae 100644 --- a/src/calibre/trac/bzr_commit_plugin.py +++ b/src/calibre/trac/bzr_commit_plugin.py @@ -110,6 +110,7 @@ class cmd_commit(_cmd_commit): suffix = 'The fix will be in the next release.' action = action+'ed' msg = '%s in branch %s. %s'%(action, nick, suffix) + msg = msg.replace('Fixesed', 'Fixed') server = xmlrpclib.ServerProxy(url) server.ticket.update(int(bug), msg, {'status':'closed', 'resolution':'fixed'}, diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index b4afe7576d..938960df93 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian ' __docformat__ = 'restructuredtext en' -import re +import re, htmlentitydefs _ascii_pat = None @@ -21,3 +21,32 @@ def clean_ascii_chars(txt, charlist=None): pat = re.compile(u'|'.join(map(unichr, charlist))) return pat.sub('', txt) +## +# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html +# Removes HTML or XML character references and entities from a text string. +# +# @param text The HTML (or XML) source text. +# @return The plain text, as a Unicode string, if necessary. + +def unescape(text, rm=False, rchar=u''): + def fixup(m, rm=rm, rchar=rchar): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + if rm: + return rchar #replace by char + return text # leave as is + return re.sub("&#?\w+;", fixup, text) diff --git a/src/calibre/utils/magick/draw.py b/src/calibre/utils/magick/draw.py index c03a8660c8..ad4b681b43 100644 --- a/src/calibre/utils/magick/draw.py +++ b/src/calibre/utils/magick/draw.py @@ -92,7 +92,10 @@ def identify_data(data): or raises an Exception if data is not an image. ''' img = Image() - img.load(data) + if hasattr(img, 'identify'): + img.identify(data) + else: + img.load(data) width, height = img.size fmt = img.format return (width, height, fmt) diff --git a/src/calibre/utils/magick/magick.c b/src/calibre/utils/magick/magick.c index fd9563529a..869b77c736 100644 --- a/src/calibre/utils/magick/magick.c +++ b/src/calibre/utils/magick/magick.c @@ -456,6 +456,26 @@ magick_Image_load(magick_Image *self, PyObject *args, PyObject *kwargs) { // }}} +// Image.identify {{{ +static PyObject * +magick_Image_identify(magick_Image *self, PyObject *args, PyObject *kwargs) { + const char *data; + Py_ssize_t dlen; + MagickBooleanType res; + + NULL_CHECK(NULL) + if (!PyArg_ParseTuple(args, "s#", &data, &dlen)) return NULL; + + res = MagickPingImageBlob(self->wand, data, dlen); + + if (!res) + return magick_set_exception(self->wand); + + Py_RETURN_NONE; +} + +// }}} + // Image.open {{{ static PyObject * magick_Image_read(magick_Image *self, PyObject *args, PyObject *kwargs) { @@ -993,6 +1013,10 @@ static PyMethodDef magick_Image_methods[] = { {"destroy", (PyCFunction)magick_Image_destroy, METH_VARARGS, "Destroy the underlying ImageMagick Wand. WARNING: After using this method, all methods on this object will raise an exception."}, + {"identify", (PyCFunction)magick_Image_identify, METH_VARARGS, + "Identify an image from a byte buffer (string)" + }, + {"load", (PyCFunction)magick_Image_load, METH_VARARGS, "Load an image from a byte buffer (string)" }, diff --git a/src/calibre/utils/wmf/parse.py b/src/calibre/utils/wmf/parse.py new file mode 100644 index 0000000000..c618884e33 --- /dev/null +++ b/src/calibre/utils/wmf/parse.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, struct + + + +class WMFHeader(object): + + ''' + For header documentation, see + http://www.skynet.ie/~caolan/publink/libwmf/libwmf/doc/ora-wmf.html + ''' + + def __init__(self, data, log, verbose): + self.log, self.verbose = log, verbose + offset = 0 + file_type, header_size, windows_version = struct.unpack_from(' 0: + params = data[offset:offset+delta] + offset += delta + + func = self.function_map.get(func, func) + + if self.verbose > 3: + self.log.debug('WMF Record:', size, func) + self.records.append((func, params)) + + for rec in self.records: + f = getattr(self, rec[0], None) + if callable(f): + f(rec[1]) + elif self.verbose > 2: + self.log.debug('Ignoring record:', rec[0]) + + self.has_raster_image = len(self.bitmaps) > 0 + + + def SetMapMode(self, params): + if len(params) == 2: + self.map_mode = struct.unpack('