HTML Input: switch to using a stack rather than recursion

Allows processing deeply nested input without running out of stack
space. This is especially an issue on 64-bit Windows, where the stack is
quite small. Fixes #1981438 [Private bug](https://bugs.launchpad.net/calibre/+bug/1981438)
This commit is contained in:
Kovid Goyal 2022-07-12 22:29:33 +05:30
parent 390e47bb4d
commit 4656e79cba
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 62 additions and 23 deletions

View File

@ -175,25 +175,67 @@ class HTMLFile:
return Link(url, self.base)
def depth_first(root, flat, visited=None):
    """Yield ``root`` and everything reachable from it in depth-first order.

    The traversal uses an explicit stack rather than recursion, so a long
    chain of nested links cannot exhaust the interpreter's call stack.

    :param root: Item to start from. It is always yielded first. It must
        expose ``links``, a reversible sequence of link objects.
    :param flat: Sequence of known items. Each followed link is resolved to
        the entry of ``flat`` that compares equal to it; links with no such
        entry are skipped.
    :param visited: Optional set of items to treat as already seen. It is
        updated in place. Accepted for compatibility with the earlier
        recursive signature; a fresh set is used when omitted.
    :return: Generator over ``root`` followed by each newly reached entry
        of ``flat``, each yielded at most once.
    """
    yield root
    if visited is None:
        visited = set()
    visited.add(root)
    stack = []

    def push_links_from(item):
        # Push in reverse so that the first link of ``item`` ends up on top
        # of the stack and is therefore explored first.
        stack.extend(
            link for link in reversed(item.links)
            if link.path is not None and link not in visited
        )

    push_links_from(root)
    while stack:
        link = stack.pop()
        try:
            index = flat.index(link)
        except ValueError:  # Can happen if max_levels is used
            continue
        hf = flat[index]
        # A target may have been pushed several times before its first visit
        if hf not in visited:
            yield hf
            visited.add(hf)
            push_links_from(hf)
def find_tests():
    """Return the unittest suite for this module.

    The suite holds one test case, ``TestHTMLInput``, which checks the order
    in which ``depth_first`` walks a small hand-built link graph.
    """
    import unittest

    class HF:
        """Lightweight stand-in for a file node: a ``path`` plus ``links``."""

        def __init__(self, path):
            self.path = path
            self.links = []

        def a(self, hf):
            # Register ``hf`` as an outgoing link and hand it back, so a
            # graph can be built one edge per statement.
            self.links.append(hf)
            return hf

        def __eq__(self, other):
            # Compare by path; ``other`` may be a node or a bare path value
            return self.path == getattr(other, 'path', other)

        def __hash__(self):
            return hash(self.path)

        def __repr__(self):
            return self.path

    class TestHTMLInput(unittest.TestCase):

        def test_depth_first(self):
            # Graph under test:
            #   root -> a -> a1 -> x
            #           a -> a2
            #   root -> b -> b1
            root = HF('root')
            a = root.a(HF('a'))
            a1 = a.a(HF('a1'))
            x = a1.a(HF('x'))
            a2 = a.a(HF('a2'))
            b = root.a(HF('b'))
            b1 = b.a(HF('b1'))
            flat = root, a, b, a1, a2, b1, x
            expected = (root, a, a1, x, a2, b, b1)
            walked = tuple(depth_first(flat[0], flat))
            self.assertEqual(walked, expected)

    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestHTMLInput)
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
@ -233,12 +275,7 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
hf.links.remove(link)
next_level = list(nl)
orec = sys.getrecursionlimit()
sys.setrecursionlimit(500000)
try:
return flat, list(depth_first(flat[0], flat))
finally:
sys.setrecursionlimit(orec)
return flat, list(depth_first(flat[0], flat))
def get_filelist(htmlfile, dir, opts, log):

View File

@ -261,6 +261,8 @@ def find_tests(which_tests=None, exclude_tests=None):
from calibre.gui2.viewer.annotations import find_tests
a(find_tests())
if ok('misc'):
from calibre.ebooks.html.input import find_tests
a(find_tests())
from calibre.ebooks.metadata.test_author_sort import find_tests
a(find_tests())
from calibre.ebooks.metadata.tag_mapper import find_tests