diff --git a/setup/plugins_mirror.py b/setup/plugins_mirror.py index aac0c10fc4..e17e648c92 100644 --- a/setup/plugins_mirror.py +++ b/setup/plugins_mirror.py @@ -124,7 +124,7 @@ def parse_index(raw=None): # {{{ return category raise ValueError('Could not find category for offset: ' + str(offset)) - for match in re.finditer(r'''(?is)(.+?)<(.+?)''', raw): + for match in re.finditer(r'''(?is)(.+?)<(.+?)''', raw): name, url, rest = u(match.group(2)), u(match.group(1)), match.group(3) category = category_at(match.start(2)) deprecated = category == deprecated_category diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 8777babc88..7c8af1caea 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -398,6 +398,7 @@ class GoogleBooks(Source): timeout=30 ): from calibre.utils.filenames import ascii_text + from polyglot.urllib import urlparse isbn = check_isbn(identifiers.get('isbn', None)) q = [] strip_punc_pat = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE) @@ -440,7 +441,13 @@ class GoogleBooks(Source): pat = re.compile(r'id=([^&]+)') for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False): m = pat.search(q.url) - if m is None or not q.url.startswith('https://books.google'): + if m is None or not q.url: + continue + try: + purl = urlparse(q.url) + except Exception: + continue + if not (purl.hostname or '').startswith('books.google.'): continue google_ids.append(m.group(1)) diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py index e36eb19abe..44f409d4b6 100644 --- a/src/calibre/ebooks/metadata/sources/search_engines.py +++ b/src/calibre/ebooks/metadata/sources/search_engines.py @@ -336,7 +336,7 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True): purl = urlparse(url) except Exception: continue - if 'google.com' in purl.netloc: 
+ if (purl.hostname or '').endswith('google.com'): continue try: title = tostring(next(a.iterchildren('span')))