mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Houston Chronicle
This commit is contained in:
parent
c28a031735
commit
38012bd21c
@ -48,7 +48,8 @@ def validate_link(page, link, title):
|
|||||||
if not title or len(title.strip()) < 5:
|
if not title or len(title.strip()) < 5:
|
||||||
print("{0} rejected, title too short".format(link))
|
print("{0} rejected, title too short".format(link))
|
||||||
return None
|
return None
|
||||||
if link.split('/')[3] in other_category:
|
parts = link.split('/')
|
||||||
|
if len(parts) > 3 and parts[3] in other_category:
|
||||||
print("{0} rejected, covered in other section".format(link))
|
print("{0} rejected, covered in other section".format(link))
|
||||||
return None
|
return None
|
||||||
for excluded_title in excluded_titles:
|
for excluded_title in excluded_titles:
|
||||||
@ -69,7 +70,10 @@ def sort_subject(element_list):
|
|||||||
subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
|
subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
|
||||||
rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
|
rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
|
||||||
for element in element_list:
|
for element in element_list:
|
||||||
subj = element[0].split('/')[3]
|
try:
|
||||||
|
subj = element[0].split('/')[3]
|
||||||
|
except Exception:
|
||||||
|
subj = 'unknown'
|
||||||
if subject_dict.get(subj) is not None:
|
if subject_dict.get(subj) is not None:
|
||||||
rank_dict[subject_dict[subj] + 1].append(element)
|
rank_dict[subject_dict[subj] + 1].append(element)
|
||||||
else:
|
else:
|
||||||
@ -161,7 +165,7 @@ class HoustonChronicle(BasicNewsRecipe):
|
|||||||
for descendant in el.contents:
|
for descendant in el.contents:
|
||||||
if isinstance(descendant, NavigableString):
|
if isinstance(descendant, NavigableString):
|
||||||
result.append(type(u'')(descendant).strip())
|
result.append(type(u'')(descendant).strip())
|
||||||
all_text = u' '.join(result).encode('utf-8')
|
all_text = u' '.join(result)
|
||||||
if len(all_text) > 1:
|
if len(all_text) > 1:
|
||||||
sentences = re.findall(sentence_regex, all_text)
|
sentences = re.findall(sentence_regex, all_text)
|
||||||
if sentences is not None and len(sentences) > 0:
|
if sentences is not None and len(sentences) > 0:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user