mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
8b3af7ce8e
commit
62764e1bbe
@ -1,8 +1,9 @@
|
|||||||
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
||||||
|
|
||||||
import re
|
import string, re
|
||||||
|
from calibre import strftime
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
|
||||||
|
|
||||||
|
|
||||||
class TheIndependentNew(BasicNewsRecipe):
|
class TheIndependentNew(BasicNewsRecipe):
|
||||||
@ -49,7 +50,7 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
preprocess_regexps = [
|
preprocess_regexps = [
|
||||||
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
||||||
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
||||||
(re.compile('<strong>.*?Click.*?to view graphic.*?</strong>', re.DOTALL),
|
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
|
||||||
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -104,6 +105,14 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
|
||||||
|
#remove 'advertorial articles'
|
||||||
|
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
|
||||||
|
if strapline:
|
||||||
|
for para in strapline.findAll('p'):
|
||||||
|
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
|
||||||
|
and para.contents[0] == 'ADVERTORIAL FEATURE':
|
||||||
|
return None
|
||||||
|
|
||||||
items_to_extract = []
|
items_to_extract = []
|
||||||
|
|
||||||
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
||||||
@ -189,9 +198,14 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
|
|
||||||
#remove empty paragraph tags in storyTop which can leave a space
|
#remove empty paragraph tags in storyTop which can leave a space
|
||||||
#between first paragraph and rest of story
|
#between first paragraph and rest of story
|
||||||
|
nested_content = False
|
||||||
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
|
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
|
||||||
for item in storyTop.findAll('p'):
|
for item in storyTop.findAll('p'):
|
||||||
if item.contents is not None and len(item.contents[0]) <= 1 :
|
for nested in item:
|
||||||
|
if isinstance(nested, Tag):
|
||||||
|
nested_content = True
|
||||||
|
break
|
||||||
|
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
|
||||||
items_to_extract.append(item)
|
items_to_extract.append(item)
|
||||||
|
|
||||||
for item in items_to_extract:
|
for item in items_to_extract:
|
||||||
@ -211,6 +225,8 @@ class TheIndependentNew(BasicNewsRecipe):
|
|||||||
items_to_insert = []
|
items_to_insert = []
|
||||||
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
|
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
|
||||||
strong = item.find('strong')
|
strong = item.find('strong')
|
||||||
|
if not strong:
|
||||||
|
continue
|
||||||
for child in strong:
|
for child in strong:
|
||||||
if isinstance(child,Tag):
|
if isinstance(child,Tag):
|
||||||
if str(child.name) == 'a':
|
if str(child.name) == 'a':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user