Fix #1983126 [Metadata download: Paragraph breaks not working for Google](https://bugs.launchpad.net/calibre/+bug/1983126)

This commit is contained in:
Kovid Goyal 2022-07-31 09:26:01 +05:30
parent 94dbef4f80
commit 353dd847d1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -15,7 +15,7 @@ try:
except ImportError: except ImportError:
from Queue import Empty, Queue from Queue import Empty, Queue
from calibre import as_unicode from calibre import as_unicode, replace_entities
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import authors_to_string, check_isbn from calibre.ebooks.metadata import authors_to_string, check_isbn
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
@@ -31,6 +31,15 @@ NAMESPACES = {
} }
def pretty_google_books_comments(raw):
    """Re-insert paragraph breaks into a Google Books comments string.

    ``raw`` is first passed through ``replace_entities`` (imported from
    calibre; presumably it decodes HTML/XML entities -- confirm against
    that helper). Every lowercase letter followed directly by a period and
    an uppercase letter then has a blank line inserted after the period.

    Returns the resulting string.
    """
    unescaped = replace_entities(raw)
    # Google strips the paragraph breaks from comments without leaving a
    # space behind, so a sentence end runs straight into the next sentence
    # ("word.Capital"). That pattern is used here as a marker for where a
    # paragraph boundary used to be.
    return re.sub(r'([a-z])\.([A-Z])', '\\1.\n\n\\2', unescaped)
def get_details(browser, url, timeout): # {{{ def get_details(browser, url, timeout): # {{{
try: try:
raw = browser.open_novisit(url, timeout=timeout).read() raw = browser.open_novisit(url, timeout=timeout).read()
@@ -187,7 +196,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
class GoogleBooks(Source): class GoogleBooks(Source):
name = 'Google' name = 'Google'
version = (1, 0, 7) version = (1, 0, 8)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Google Books') description = _('Downloads metadata and covers from Google Books')
@@ -342,6 +351,8 @@ class GoogleBooks(Source):
self.cache_isbn_to_identifier(isbn, goog) self.cache_isbn_to_identifier(isbn, goog)
if getattr(ans, 'has_google_cover', False): if getattr(ans, 'has_google_cover', False):
self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog) self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)
if ans.comments:
ans.comments = pretty_google_books_comments(ans.comments)
self.clean_downloaded_metadata(ans) self.clean_downloaded_metadata(ans)
return ans return ans
# }}} # }}}