This commit is contained in:
Kovid Goyal 2017-07-08 17:49:21 +05:30
commit 32d5aa6fbb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -37,39 +37,30 @@ class PrivateEyeRecipe(BasicNewsRecipe):
'author_sort': title_author, 'author_sort': title_author,
'smarten_punctuation': True, 'smarten_punctuation': True,
'series': title, 'series': title,
'publisher': title_author, 'publisher': title_author, }
}
remove_tags_before = [ remove_tags_before = [
{ {
'id': 'story', 'id': 'story',
'class': 'article', 'class': 'article', },
},
{ {
'id': 'page' 'id': 'page'}, ]
},
]
remove_tags_after = [ remove_tags_after = [
{ {
'class': 'section', 'class': 'section', }, ]
},
]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class': 'sub-nav-bar'}), dict(name='div', attrs={'class': 'sub-nav-bar'}),
dict(name='img', attrs={'class': 'about-covers'}), dict(name='img', attrs={'class': 'about-covers'}),
dict(name='div', attrs={'id': 'follow-us', dict(name='div', attrs={'id': 'follow-us',
'class': 'text'}), 'class': 'text'}),
dict(name='span', attrs={'class': 'section'}), dict(name='span', attrs={'class': 'section'}), ]
]
preprocess_regexps = [ preprocess_regexps = [
( (
re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE), re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE),
lambda match: 'http://www.private-eye.co.uk/grfx' lambda match: 'http://www.private-eye.co.uk/grfx'), ]
),
]
def fix_url(self, url): def fix_url(self, url):
if (url.startswith('//') or if (
url.startswith('http://') or url.startswith('//') or url.startswith('http://') or
url.startswith('https://')): url.startswith('https://')):
return url return url
if url.startswith('/'): if url.startswith('/'):
@ -89,14 +80,12 @@ class PrivateEyeRecipe(BasicNewsRecipe):
if url and url not in self.urls: if url and url not in self.urls:
self.urls.append(url) self.urls.append(url)
self.log.info( self.log.info(
"Page added: %s: %s: %s (%s)" % (date, title, description, url) "Page added: %s: %s: %s (%s)" % (date, title, description, url))
)
self.current_articles.append({ self.current_articles.append({
'title': title, 'title': title,
'url': url, 'url': url,
'description': description, 'description': description,
'date': date, 'date': date, })
})
def page_index_append(self, section): def page_index_append(self, section):
if self.current_articles: if self.current_articles:
@ -140,21 +129,17 @@ class PrivateEyeRecipe(BasicNewsRecipe):
day, month, year = tag_contents[2].split() day, month, year = tag_contents[2].split()
day = ''.join(c for c in day if c.isdigit()) day = ''.join(c for c in day if c.isdigit())
date = datetime.strptime( date = datetime.strptime(
" ".join((day, month, year)), " ".join((day, month, year)), "%d %B %Y")
"%d %B %Y"
)
date = date - timedelta(12) date = date - timedelta(12)
self.publication_date = datetime.strftime( self.publication_date = datetime.strftime(
date, date, "%d %B %Y").lstrip("0")
"%d %B %Y"
).lstrip("0")
self.log.debug("Publication date: %s" % self.publication_date) self.log.debug("Publication date: %s" % self.publication_date)
self.title_with_date = self.title + datetime.strftime(date, " %Y-%m-%d") self.title_with_date = self.title + datetime.strftime(
date, " %Y-%m-%d")
break break
except: except:
self.log.warning( self.log.warning(
"Invalid publication date: %s" % tag.contents[2] "Invalid publication date: %s" % tag.contents[2])
)
else: else:
self.log.warning("Publication date not found") self.log.warning("Publication date not found")
@ -235,6 +220,16 @@ It offers a unique blend of humour, social and political observations and invest
return self.page_index return self.page_index
def preprocess_html(self, soup):
    """Make site-relative image links in ``soup`` absolute.

    Every ``<a>`` element whose ``href`` contains ``jpg``, ``png`` or
    ``gif`` is inspected; when that ``href`` begins with ``/`` it is
    replaced by the result of ``self.fix_url``. Other links are left
    untouched.

    :param soup: parsed page exposing ``findAll`` (presumably a
        BeautifulSoup tree -- confirm against the recipe framework).
    :return: the same ``soup`` object, modified in place.
    """
    image_markers = ('jpg', 'png', 'gif')

    def mentions_image(href):
        # Falsy hrefs (missing or empty) are handed back unchanged so the
        # filter rejects them without attempting a substring test.
        return href and any(marker in href for marker in image_markers)

    for anchor in soup.findAll('a', attrs={'href': mentions_image}):
        target = anchor['href']
        # Only site-relative paths need rewriting to an absolute address.
        if target.startswith('/'):
            anchor['href'] = self.fix_url(target)
    return soup
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
m = oeb.metadata m = oeb.metadata
m.clear('title') m.clear('title')