Mirror of https://github.com/kovidgoyal/calibre.git
commit 32d5aa6fbb
Merge branch 'patch-12' of https://github.com/Sophist-UK/calibre
@@ -37,39 +37,30 @@ class PrivateEyeRecipe(BasicNewsRecipe):
         'author_sort': title_author,
         'smarten_punctuation': True,
         'series': title,
-        'publisher': title_author,
-    }
+        'publisher': title_author, }
     remove_tags_before = [
         {
             'id': 'story',
-            'class': 'article',
-        },
+            'class': 'article', },
         {
-            'id': 'page'
-        },
-    ]
+            'id': 'page'}, ]
     remove_tags_after = [
         {
-            'class': 'section',
-        },
-    ]
+            'class': 'section', }, ]
     remove_tags = [
         dict(name='div', attrs={'class': 'sub-nav-bar'}),
         dict(name='img', attrs={'class': 'about-covers'}),
         dict(name='div', attrs={'id': 'follow-us',
                                 'class': 'text'}),
-        dict(name='span', attrs={'class': 'section'}),
-    ]
+        dict(name='span', attrs={'class': 'section'}), ]
     preprocess_regexps = [
         (
             re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE),
-            lambda match: 'http://www.private-eye.co.uk/grfx'
-        ),
-    ]
+            lambda match: 'http://www.private-eye.co.uk/grfx'), ]

     def fix_url(self, url):
-        if (url.startswith('//') or
-                url.startswith('http://') or
+        if (
+                url.startswith('//') or url.startswith('http://') or
                 url.startswith('https://')):
             return url
         if url.startswith('/'):
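
Note on the fix_url / preprocess_regexps hunk above: the reflowed if keeps the same behaviour, and the joined lambda still rewrites relative '../grfx' image paths to the site's absolute address. A minimal standalone sketch of that combined logic, using a hypothetical absolutise() helper rather than the recipe's own method:

import re

# Same pattern as the preprocess_regexps entry above.
GRFX_RE = re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE)

def absolutise(url):
    # Absolute and protocol-relative URLs pass through untouched,
    # mirroring the early return in fix_url().
    if url.startswith(('//', 'http://', 'https://')):
        return url
    # Relative grfx paths are rebased onto the site root.
    return GRFX_RE.sub('http://www.private-eye.co.uk/grfx', url)

assert absolutise('https://www.private-eye.co.uk/') == 'https://www.private-eye.co.uk/'
assert absolutise('../grfx/cover.jpg') == 'http://www.private-eye.co.uk/grfx/cover.jpg'
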
@@ -89,14 +80,12 @@ class PrivateEyeRecipe(BasicNewsRecipe):
         if url and url not in self.urls:
             self.urls.append(url)
             self.log.info(
-                "Page added: %s: %s: %s (%s)" % (date, title, description, url)
-            )
+                "Page added: %s: %s: %s (%s)" % (date, title, description, url))
             self.current_articles.append({
                 'title': title,
                 'url': url,
                 'description': description,
-                'date': date,
-            })
+                'date': date, })

     def page_index_append(self, section):
         if self.current_articles:
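
Note on the hunk above: only the line-wrapping changes; the dict pushed onto current_articles keeps the title/url/description/date keys that calibre's BasicNewsRecipe.parse_index() expects in each (section, article-list) pair. A tiny sketch of that shape, with invented values:

article = {
    'title': 'In The Back',
    'url': 'https://www.private-eye.co.uk/in-the-back',
    'description': 'Investigative stories',
    'date': '11 June 2025',
}
feeds = [('Private Eye', [article])]  # the structure parse_index() returns
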
@@ -140,21 +129,17 @@ class PrivateEyeRecipe(BasicNewsRecipe):
                 day, month, year = tag_contents[2].split()
                 day = ''.join(c for c in day if c.isdigit())
                 date = datetime.strptime(
-                    " ".join((day, month, year)),
-                    "%d %B %Y"
-                )
+                    " ".join((day, month, year)), "%d %B %Y")
                 date = date - timedelta(12)
                 self.publication_date = datetime.strftime(
-                    date,
-                    "%d %B %Y"
-                ).lstrip("0")
+                    date, "%d %B %Y").lstrip("0")
                 self.log.debug("Publication date: %s" % self.publication_date)
-                self.title_with_date = self.title + datetime.strftime(date, " %Y-%m-%d")
+                self.title_with_date = self.title + datetime.strftime(
+                    date, " %Y-%m-%d")
                 break
             except:
                 self.log.warning(
-                    "Invalid publication date: %s" % tag.contents[2]
-                )
+                    "Invalid publication date: %s" % tag.contents[2])
         else:
             self.log.warning("Publication date not found")

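
Note on the hunk above: the collapsed strptime/strftime calls are formatting-only; the logic still keeps only the digits of the day, parses the scraped date, subtracts 12 days and stores the result as the publication date. A standalone sketch of that arithmetic with an invented sample date:

from datetime import datetime, timedelta

day, month, year = '23rd', 'June', '2025'     # sample values, not scraped
day = ''.join(c for c in day if c.isdigit())  # '23'
date = datetime.strptime(' '.join((day, month, year)), '%d %B %Y')
date = date - timedelta(12)
print(datetime.strftime(date, '%d %B %Y').lstrip('0'))  # 11 June 2025
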
@@ -235,6 +220,16 @@ It offers a unique blend of humour, social and political observations and invest

         return self.page_index

+    def preprocess_html(self, soup):
+        for figure in soup.findAll(
+                'a',
+                attrs={'href':
+                       lambda x: x and ('jpg' in x or 'png' in x or 'gif' in x)}):
+            # makes sure that the link points to the absolute web address
+            if figure['href'].startswith('/'):
+                figure['href'] = self.fix_url(figure['href'])
+        return soup
+
     def postprocess_book(self, oeb, opts, log):
         m = oeb.metadata
         m.clear('title')
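
Note on the added preprocess_html(): it makes root-relative hrefs on image links absolute by routing them through fix_url(). A standalone sketch of the same idea in plain bs4; the sample markup and the hard-coded base URL are assumptions for illustration, not taken from the recipe:

from bs4 import BeautifulSoup

html = '<a href="/grfx/cover.jpg">cover</a> <a href="#top">top</a>'
soup = BeautifulSoup(html, 'html.parser')
for figure in soup.find_all(
        'a', href=lambda x: x and ('jpg' in x or 'png' in x or 'gif' in x)):
    if figure['href'].startswith('/'):
        # The recipe delegates this rewrite to its fix_url(); here the
        # base URL is simply prepended.
        figure['href'] = 'http://www.private-eye.co.uk' + figure['href']
print(soup)  # only the .jpg link has been made absolute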