From 3a12b18dc353a7256d30c55267af94f035a97338 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 2 Jun 2010 10:46:53 -0600 Subject: [PATCH 1/4] One more fix for NYTimes --- resources/recipes/nytimes.recipe | 8 ++++++-- resources/recipes/nytimes_sub.recipe | 16 ++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index 33758e8c47..eba717027e 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe): return ans def preprocess_html(self, soup): - # Skip ad pages before actual article + # Skip ad pages served before actual article skip_tag = soup.find(True, {'name':'skip'}) if skip_tag is not None: - soup = self.index_to_soup(skip_tag.parent['href']) + self.log.error("Found forwarding link: %s" % skip_tag.parent['href']) + url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) + url += '?pagewanted=all' + self.log.error("Skipping ad to article at '%s'" % url) + soup = self.index_to_soup(url) return self.strip_anchors(soup) def postprocess_html(self,soup, True): diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 79c0d49223..c08b06572d 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe): return ans def preprocess_html(self, soup): - ''' - refresh = soup.find('meta', {'http-equiv':'refresh'}) - if refresh is None: - return soup - content = refresh.get('content').partition('=')[2] - raw = self.browser.open('http://www.nytimes.com'+content).read() - return BeautifulSoup(raw.decode('cp1252', 'replace')) - ''' - # Skip ad pages before actual article + # Skip ad pages served before actual article skip_tag = soup.find(True, {'name':'skip'}) if skip_tag is not None: - soup = self.index_to_soup(skip_tag.parent['href']) + self.log.error("Found forwarding link: %s" % skip_tag.parent['href']) + url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) + url += '?pagewanted=all' + self.log.error("Skipping ad to article at '%s'" % url) + soup = self.index_to_soup(url) return self.strip_anchors(soup) def postprocess_html(self,soup, True): From 6221f6747398d8a6ee9e68fd31cddb00b7e2100d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 2 Jun 2010 14:08:17 -0600 Subject: [PATCH 2/4] Fix #5666 (coverter UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 57: unexpected code byte) --- src/calibre/ebooks/html/input.py | 4 ++-- src/calibre/ebooks/metadata/html.py | 6 +++++- src/calibre/utils/magick_draw.py | 2 ++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 413db1cc0b..6108aa329d 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -416,9 +416,9 @@ class HTMLInput(InputFormatPlugin): link = unquote(link).replace('/', os.sep) if not link.strip(): return link_ - if base and not os.path.isabs(link): - link = os.path.join(base, link) try: + if base and not os.path.isabs(link): + link = os.path.join(base, link) link = os.path.abspath(link) except: return link_ diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index d5aa9b8bef..45b592c709 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -11,7 +11,7 @@ import re from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.chardet import xml_to_unicode - +from calibre import entity_to_unicode def get_metadata(stream): src = stream.read() @@ -43,6 +43,10 @@ def get_metadata_(src, encoding=None): if match: author = match.group(2).replace(',', ';') + ent_pat = re.compile(r'&(\S+)?;') + title = ent_pat.sub(entity_to_unicode, title) + if author: + author = ent_pat.sub(entity_to_unicode, author) mi = MetaInformation(title, [author] if author else None) # Publisher diff --git a/src/calibre/utils/magick_draw.py b/src/calibre/utils/magick_draw.py index 0288107b45..5625da0869 100644 --- a/src/calibre/utils/magick_draw.py +++ b/src/calibre/utils/magick_draw.py @@ -51,6 +51,8 @@ class FontMetrics(object): def get_font_metrics(image, d_wand, text): + if isinstance(text, unicode): + text = text.encode('utf-8') ret = p.MagickQueryFontMetrics(image, d_wand, text) return FontMetrics(ret) From 4bd4ce1678b0180b0bbc3753923d92ba74fcf62f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 2 Jun 2010 14:24:36 -0600 Subject: [PATCH 3/4] ... --- src/calibre/gui2/main.py | 3 ++- src/calibre/gui2/ui.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index c47c821913..73f7f3839d 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -121,7 +121,8 @@ class GuiRunner(QObject): def start_gui(self): from calibre.gui2.ui import Main - main = Main(self.library_path, self.db, self.listener, self.opts, self.actions) + main = Main(self.opts) + main.initialize(self.library_path, self.db, self.listener, self.actions) add_filesystem_book = partial(main.add_filesystem_book, allow_device=False) sys.excepthook = main.unhandled_exception if len(self.args) > 1: diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 8bc85e7195..5f7d4b76cd 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -127,13 +127,18 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): pixmap_to_data(pixmap)) self.last_time = datetime.datetime.now() - def __init__(self, library_path, db, listener, opts, actions, parent=None): + + def __init__(self, opts, parent=None): + MainWindow.__init__(self, opts, parent) + self.opts = opts + + def initialize(self, library_path, db, listener, actions): + opts = self.opts self.last_time = datetime.datetime.now() self.preferences_action, self.quit_action = actions self.library_path = library_path self.spare_servers = [] self.must_restart_before_config = False - MainWindow.__init__(self, opts, parent) # Initialize fontconfig in a separate thread as this can be a lengthy # process if run for the first time on this machine from calibre.utils.fonts import fontconfig From d254bac885f7545602e4219bcb1b5e975b4bc636 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 2 Jun 2010 15:02:10 -0600 Subject: [PATCH 4/4] ... --- src/calibre/library/database2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 867ac5dbb7..2307face58 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -987,6 +987,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): def rename_tag(self, old, new): self.conn.execute('UPDATE tags SET name=? WHERE name=?', (new, old)) + self.conn.commit() def get_tags(self, id): result = self.conn.get(