mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
8b3af7ce8e
commit
62764e1bbe
@ -1,8 +1,9 @@
|
||||
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
||||
|
||||
import re
|
||||
import string, re
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
|
||||
|
||||
|
||||
class TheIndependentNew(BasicNewsRecipe):
|
||||
@ -49,7 +50,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
preprocess_regexps = [
|
||||
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
||||
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
||||
(re.compile('<strong>.*?Click.*?to view graphic.*?</strong>', re.DOTALL),
|
||||
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
|
||||
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
||||
]
|
||||
|
||||
@ -104,6 +105,14 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
#remove 'advertorial articles'
|
||||
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
|
||||
if strapline:
|
||||
for para in strapline.findAll('p'):
|
||||
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
|
||||
and para.contents[0] == 'ADVERTORIAL FEATURE':
|
||||
return None
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
||||
@ -189,9 +198,14 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
#remove empty paragraph tags in storyTop which can leave a space
|
||||
#between first paragraph and rest of story
|
||||
nested_content = False
|
||||
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
|
||||
for item in storyTop.findAll('p'):
|
||||
if item.contents is not None and len(item.contents[0]) <= 1 :
|
||||
for nested in item:
|
||||
if isinstance(nested, Tag):
|
||||
nested_content = True
|
||||
break
|
||||
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
|
||||
items_to_extract.append(item)
|
||||
|
||||
for item in items_to_extract:
|
||||
@ -211,6 +225,8 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
items_to_insert = []
|
||||
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
|
||||
strong = item.find('strong')
|
||||
if not strong:
|
||||
continue
|
||||
for child in strong:
|
||||
if isinstance(child,Tag):
|
||||
if str(child.name) == 'a':
|
||||
|
Loading…
x
Reference in New Issue
Block a user