From 353dd847d19b832552a978c1273c42fa6e5ac230 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Jul 2022 09:26:01 +0530 Subject: [PATCH] Fix #1983126 [Metadata download: Paragraph breaks not working for Google](https://bugs.launchpad.net/calibre/+bug/1983126) --- src/calibre/ebooks/metadata/sources/google.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index e615c0b1b1..10086e1ee8 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -15,7 +15,7 @@ try: except ImportError: from Queue import Empty, Queue -from calibre import as_unicode +from calibre import as_unicode, replace_entities from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import authors_to_string, check_isbn from calibre.ebooks.metadata.book.base import Metadata @@ -31,6 +31,15 @@ NAMESPACES = { } +def pretty_google_books_comments(raw): + raw = replace_entities(raw) + # Paragraph breaks in the comments are removed, but whatever software Google + # uses to do this does not insert a space, so we often find the pattern + # word.Capital in the comments, which can be used to find paragraph markers. 
+ raw = re.sub(r'([a-z])\.([A-Z])', '\\1.\n\n\\2', raw) + return raw + + def get_details(browser, url, timeout): # {{{ try: raw = browser.open_novisit(url, timeout=timeout).read() @@ -187,7 +196,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{ class GoogleBooks(Source): name = 'Google' - version = (1, 0, 7) + version = (1, 0, 8) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Google Books') @@ -342,6 +351,8 @@ class GoogleBooks(Source): self.cache_isbn_to_identifier(isbn, goog) if getattr(ans, 'has_google_cover', False): self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog) + if ans.comments: + ans.comments = pretty_google_books_comments(ans.comments) self.clean_downloaded_metadata(ans) return ans # }}}