Make Respekt Magazine recipe work again and remove dysfunctional Respekt-Web recipe following a rewrite of the harvested website

felagund 2016-09-13 13:15:32 +02:00
parent 159ba40d56
commit 28c733ad8f
2 changed files with 148 additions and 332 deletions

@@ -12,137 +12,178 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
 #This imports the version bundled with Calibre
 import lxml
 from lxml.builder import E
+respekt_url = 'http://www.respekt.cz'
 class respektRecipe(BasicNewsRecipe):
-    __author__ = u'Tomáš Hnyk'
+    __author__ = 'Tomáš Hnyk'
     title = u'Respekt - Magazine'
     publisher = u'Respekt Publishing a. s.'
-    description = u'Articles from the printed edition, password needed for full access'
-    encoding = 'cp1250'
+    description = u'Articles from the print edition'
+    encoding = 'utf-8'
     language = 'cs'
     remove_javascript = True
-    extra_css = 'p {text-align:justify} \
-        ul {color:black} \
-        .image_caption {font-size:50%;font-style:italic;} \
-        .author {text-align:left;} \
-        p.indent_first_line {text-indent:30px;}'
-    remove_tags_before = dict(name='div',attrs={'class':['l']})
-    remove_tags_after = dict(id='text')
-    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
-        dict(name='div',attrs={'class':['slot','reklama','date']}), \
-        dict(name='span', attrs={'class':['detail-vykrik']}), \
-        dict(name='p', attrs={'class':['detail-vykrik']}), \
-        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
-        dict(name='strong', attrs={'class':['detail-vykrik']}),
+    remove_tags_before = dict(name='h1')
+    remove_tags_after = [dict(id='postcontent')]
+    remove_tags = [dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}), \
+        dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}), \
+        dict(name='a',attrs={'class':['quote','authorship-face']}), \
+        dict(name='span',attrs={'class':'embed'}), \
+        dict(name='svg'), \
         dict(name='script')]
-    # this makes authors left-aligned by not using the author class)
-    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
-    # remove empty tags
-    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
-    preprocess_regexps.append((re.compile(r'<strong>&nbsp;</strong>', re.DOTALL|re.IGNORECASE), lambda match: '&nbsp;'))
-    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://respekt.ihned.cz/')
-        cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
-        return cover
+    extra_css = 'p {text-align:justify;margin-top:0;margin-bottom:0} \
+        ul {color:black} \
+        .frame-caption {font-weight:normal;font-size:50%;font-style:italic;} \
+        h1 {font-size:150%;margin-bottom:0;} \
+        h2 {font-size:100%;margin-bottom:0;} \
+        .post-subtitle {margin-top:0;} \
+        h3 {font-size:100%;margin-bottom:0;margin-top:0;} \
+        .box-title {background-color: lightgray;font-size:150%;font-weight:bold;margin-left:12%;margin-right:12%;margin-top:12%;margin-bottom:0;} \
+        .box-content {background-color:lightgray;margin-left:12%;margin-right:12%;margin-top:0;margin-bottom:12%;} \
+        p.indent_first_line {text-indent:30px;} \
+        a {text-decoration:none;color:black;}'
     needs_subscription = True
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://muj-ucet.ihned.cz/')
-            br.select_form(name='login')
-            br['login[nick]'] = self.username
-            br['login[pass]'] = self.password
+            br.open('https://www.respekt.cz/')
+            for form in br.forms():
+                if form.attrs.get('id') == 'frm-authBox-loginForm':
+                    br.form = form
+                    break
+            for c in form.controls:
+                if 'name' in c.attrs:
+                    if c.attrs['name'] == 'username':
+                        c.value = self.username
+                    if c.attrs['name'] == 'password':
+                        c.value = self.password
            br.submit()
         return br
-    # So that remove_tags_before works for this section
-    def preprocess_raw_html(self, raw_html, url):
-        root = lxml.html.fromstring(raw_html)
-        if root.xpath("//title")[0].text == (u"Respekt • Despekt • RESPEKT"):
-            raw_html = re.sub("h2","h1",raw_html)
-        return raw_html
     def parse_index(self):
-        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
-        root = lxml.html.fromstring(raw)
+        raw1 = self.index_to_soup('http://www.respekt.cz/tydenik/', raw=True)
+        root1 = lxml.html.fromstring(raw1)
+        current_edition_url = root1.xpath("//div[@class='heroissue']/a")[0].items()[0][1]
+        raw2 = self.index_to_soup('http://www.respekt.cz/' + current_edition_url, raw=True)
+        root2 = lxml.html.fromstring(raw2)
+        self.cover_url = root2.xpath("//i[contains(@class, 'heroissue-cover')]")[0].get("data-src")
+        # Fetch date
+        date_text = root2.xpath("//time[@class='heroissue-date']")[0].text.split(',')[1]
+        s = date_text.split(" ")
+        # Are the dates of the issue in the same month and year?
+        if len(s) == 4 or len(s) == 7:
+            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[-1]])
+        elif len(s) == 8:
+            date = "/".join([s[1].split(".")[0],s[2].split(".")[0],s[3]])
+        self.conversion_options = {'pubdate':date}
+        self.title = "Respekt magazine #" + "/".join(current_edition_url.split("/")[-1:-3:-1])
         ans = []
-        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
-            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
-            date = article.xpath("span[@class='date-author']")[0].text[:-3]
-            title = article.find("h2").find("a").text
-            url = article.find('h2').find('a').get('href')
-            link = {'title':title,'url':url,'date':date}
-            for section in ans:
-                if section[0] == section_title:
-                    section[1].append(link)
-                    break
-            else:
-                ans.append((section_title,[link]))
+        for section in root2.xpath("//div[@class='col-md-6']/div[@class='issuedetail-categorized-sectionname']"):
+            section_name = section.text
+            articles = []
+            article = section.getnext()
+            while hasattr(article, 'text') and not article.text.strip():
+                title = article.xpath("span[@class='issuedetail-categorized-title']")[0].text
+                url = respekt_url + article.xpath("@href")[0]
+                articles.append({'title':title,'url':url})
+                article = article.getnext()
+            ans.append((section_name,articles))
+        highlights = zip(root2.xpath("//a[@class='issuedetail-highlighted-item']"),root2.xpath("//div[@class='issuedetail-highlighted-title']"))
+        highlights.reverse()
+        sections = [i[0] for i in ans]
+        for l,t in highlights:
+            title = t.text
+            link = l.xpath('@href')[0]
+            raw3 = self.index_to_soup(respekt_url + link, raw=True)
+            root3 = lxml.html.fromstring(raw3)
+            topics = [i.text.strip() for i in root3.xpath("//div[contains(@class, 'post-topics')]/a")]
+            # The name of the section changes its position
+            if u"Téma" in topics:
+                section_name = "Fokus"
+            elif u"Rozhovor" in topics:
+                section_name = "Rozhovor"
+            else:
+                for t in topics:
+                    if t in sections:
+                        section_name = t
+                        break
+            for i in ans:
+                if i[0] == section_name:
+                    i[1].insert(-(len(i[1])),{'title':title,'url':respekt_url+link})
+            if section_name == u"Rozhovor":
+                ans.insert(sections.index(u'Fokus')+1,(section_name,[{'title':title,'url':respekt_url+link}]))
         return ans
     def cleanup(self):
-        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')
+        self.browser.open('https://www.respekt.cz/?do=logout')
     def preprocess_html(self,soup):
         raw = u''.join(unicode(a) for a in soup.contents)
         root = lxml.html.fromstring(raw)
-        # Make image captions visible
-        body = root.xpath("//div[@id='text']")[0]
-        add = 0
-        for index, element in enumerate(body):
-            try:
-                if element.tag == 'img':
-                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
-                    add += 1
-            except:
-                pass
-        # Add length of the articles in words after author
-        article_length = str(len(body.text_content().split(' '))) + ' slov'
-        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
-        # Make perex (subheading) start on a new line
-        root.xpath("//h1")[0].append(E.br(''))
-        # Indent paragraphs when typographically suitable
-        parse = True
-        # There are only single paragraphs in these sections
-        if root.xpath("//title")[0].text == u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if root.xpath("//title")[0].text == u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ":
-            parse = False
-        if parse:
-            # First paragraph is never indented
-            paragraphs = root.xpath('//p')
-            # Clear the formatting a little bit by removing these attributes
-            for par in paragraphs:
-                if 'class' in par.keys():
-                    if par.attrib['class'] == 'detail-odstavec':
-                        par.attrib.pop('class')
-            paragraphs.reverse()
-            for par in paragraphs[:-1]:
-                try:
-                    # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
-                    if len(par) > 0:
-                        if (par.text is None and par.getchildren()[0].tag == 'strong'):
-                            continue
-                    elif par.getprevious().text == u'\u2026':
-                        continue
-                    indent = False
-                    # Either indent if the paragraphs are the same
-                    if par.getprevious().attrib == par.attrib:
-                        indent = True
-                    # Or else if the first paragraph of the text was special
-                    if 'class' in par.getprevious().keys():
-                        par_name = par.getprevious().attrib['class']
-                        if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
-                            indent = True
-                    if indent:
-                        for key in par.keys():
-                            par.attrib.pop(key)
-                        par.attrib['class']="indent_first_line"
-                except:
-                    pass
+        # Fix Letem světem
+        if "Letem sv" in root.xpath("//title")[0].text:
+            p = root.xpath("//p")
+            for par in p[:]:
+                next = par.getnext()
+                if par.getchildren():
+                    child = par.getchildren()[0]
+                    if hasattr(next,"tag") and next.tag == "h2" and hasattr(child,"tag") and child.tag == "strong":
+                        text = child.text_content()
+                        if next.text:
+                            next.text = next.text + u" • " + text
+                        else:
+                            if next.getchildren():
+                                next_child = next.getchildren()[0]
+                                next_child.text = next_child.text + u" • " + text
+                        par.getparent().remove(par)
+        # Insert text length
+        text = root.xpath("//div[@id='postcontent']")[0]
+        article_length = u" • " + str(len(text.text_content().split(' '))) + ' slov'
+        try:
+            aut = root.xpath("//div[@class='authorship-names']")[0]
+            if aut.getchildren() and aut.getchildren()[0].tag == 'a':
+                t = aut.getchildren()[0]
+                t.text = 'Autor: ' + t.text + ' '
+                # Remove link
+                e = E.span(t.text)
+                t.getparent().replace(t,e)
+            else:
+                t = root.xpath("//span[@class='post-author-name']")[0]
+                t.text = ('Autor: ' + t.text + ' ')
+            root.xpath("//div[@class='authorship-names']")[0].append(E.span(article_length))
+        except:
+            pass
+        # Make images visible
+        pictures = root.xpath("//picture")
+        for picture in pictures:
+            image = picture.xpath("//source")[0]
+            image_link = [a for a in image.get('srcset').split(' ') if a[:4] == "http"][-1]
+            e = E.img({"src":image_link})
+            picture.getparent().replace(picture,e)
+        # Properly indent
+        paragraphs = root.xpath('//p')
+        paragraphs.reverse()
+        # First paragraph is never indented
+        for par in paragraphs[:-1]:
+            prev = par.getprevious()
+            # Do not indent after headings
+            if hasattr(prev,'tag') and not (prev.tag in ['h2','h3']):
+                par.attrib['class']="indent_first_line"
+        # Fix subtitle for Téma
+        try:
+            o = root.xpath("//p[@class='post-perex']")[0]
+            e = E.h2({"class":"post-subtitle"})
+            e.text = o.text
+            o.getparent().replace(o,e)
+        except:
+            pass
         return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
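
The rewritten get_browser above cannot use br.select_form(name=...) because the redesigned site's login form carries no name attribute, only an id, so it walks br.forms() and fills the controls by name. A minimal standalone sketch of the same mechanize pattern, with a placeholder URL, form id, and credentials rather than the live Respekt values:

import mechanize

def login(url, form_id, username, password):
    # url, form_id and the credentials are hypothetical placeholders;
    # the recipe above uses https://www.respekt.cz/ and the id
    # 'frm-authBox-loginForm'.
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    # select_form(name=...) cannot find an unnamed form,
    # so scan all forms and match on the id attribute instead
    for form in br.forms():
        if form.attrs.get('id') == form_id:
            br.form = form
            break
    else:
        raise ValueError('no form with id %r found' % form_id)
    # With a form selected, controls can be filled by name
    br['username'] = username
    br['password'] = password
    return br.submit()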

@@ -1,225 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: tomashnyk@gmail.com

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'tomashnyk@gmail.com'

import re,os,datetime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
from calibre.constants import config_dir, CONFIG_DIR_MODE
#This imports the version bundled with Calibre
import lxml
from lxml.builder import E

class respektWebRecipe(BasicNewsRecipe):
    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Web'
    publisher = u'Respekt Publishing a. s.'
    description = u'Free articles from respekt.cz website'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong>&nbsp;</strong>', re.DOTALL|re.IGNORECASE), lambda match: '&nbsp;'))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def parse_index(self):
        # Read already downloaded articles
        recipe_dir = os.path.join(config_dir,'recipes')
        old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
        past_items = []
        if os.path.exists(old_articles):
            with file(old_articles) as f:
                for h in f:
                    l = h.strip().split(" ")
                    past_items.append((l[0]," ".join(l[1:])))
        old_urls = [x[0] for x in past_items]
        count_items = {}
        current_items = []
        # Keep a list of only 20 latest articles for each section
        past_items.reverse()
        for item in past_items:
            if item[1] in count_items.keys():
                if count_items[item[1]] < 20:
                    count_items[item[1]] += 1
                    current_items.append(item)
            else:
                count_items[item[1]] = 1
                current_items.append(item)
        current_items.reverse()
        sections = []
        # Get the webpages to download lists of articles from
        raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
        root = lxml.html.fromstring(raw)
        sections = []
        for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
            try:
                url = section.find('a').get('href')
                if not ('?m=authors&person[id]=' in url):
                    sections.append((url,section.find('a').find('b').text))
            except:
                pass
        sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ'))
        sections.append(('http://respekt.ihned.cz/fokus/','Fokus'))
        sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub'))
        sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory'))
        sections.append(('http://respekt.ihned.cz/glosy/','Glosy'))
        # Get the list of articles
        ans = []
        for section in sections:
            raw = self.index_to_soup(section[0], raw=True)
            root = lxml.html.fromstring(raw)
            list_of_articles = []
            articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
            # Sort the articles in a section from oldest to newest
            articles.reverse()
            for article in articles:
                date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3]
                author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','')
                title = getattr(article.find("h2").find("a"),'text','')
                url = article.find('h2').find('a').get('href')
                # Only download new articles
                if url not in old_urls:
                    old_urls.append(url)
                    current_items.append((url,section[1]))
                    list_of_articles.append({'title':title,'url':url,'date':date,'author':author})
            # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger)
            if section[1] == 'Respekt DJ':
                if list_of_articles:
                    if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
                        #list_of_articles = list_of_articles[:-1]
                        current_items = current_items[:-1]
            if list_of_articles:
                ans.append((section[1],list_of_articles))
        # Write already downloaded articles
        with file(old_articles,'w') as f:
            f.write('\n'.join('{} {}'.format(*x) for x in current_items))
        return ans

    # For some reason, the following does not work:
    # preprocess_regexps.append((re.compile(r'<br/><br/>', re.DOTALL|re.IGNORECASE), lambda match: '</p><p>'))
    def preprocess_raw_html(self, raw_html, url):
        return re.sub("<br /><br />","</p><p>",raw_html)

    def preprocess_html(self,soup):
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Make image captions visible
        body = root.xpath("//div[@id='text']")[0]
        add = 0
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass
        # Give captions shown on the website the same style
        try:
            root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
        except:
            pass
        # For DJ, the perex is always the same, so remove it
        if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':
            perex = root.xpath("//div[@id='perex']")[0]
            clean = root.xpath("//div[@class='clean']")[0]
            perex.getparent().remove(perex)
            clean.getparent().remove(clean)
            # DJ section gets mal-formatted on kindle otherwise
            for i in root.xpath("//h2[@class='d-dj-t']"):
                i.attrib['class'] = ''
                E.style = "font-size:60%;font-weight:normal;"
                time = E('span',i.getprevious().text_content(),style = E.style)
                # Time should be ahead of the title
                time.tail = ' ' + i.text
                i.text = ''
                i.insert(0,time)
            for i in root.xpath("//div[@class='d-dj-d']"):
                i.attrib['class'] = ''
                i.xpath("div/span")[0].text = ''
            for i in root.xpath("//div[@class='d-dj-b']"):
                i.attrib['class'] = ''
            # Give captions shown on the website the same style
            root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'
            # Reverse the entries so that the earliest are at the top
            entries = root.xpath("//div[@class='d-dj-i']")
            entries.reverse()
            dj_body = entries[0].getparent()
            for entry in entries:
                dj_body.remove(entry)
                dj_body.append(entry)
        # We are not interested in this paragraph as it stays the same and is essentially an ad
        if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
            ad = root.xpath("//p[@id='ajmonf']")[0]
            ad.getparent().remove(ad)
        # Add length of the articles in words after author
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))
        # Indent paragraphs when typographically suitable
        # First paragraph is never indented
        paragraphs = root.xpath('//p')
        # Clear the formatting a little bit by removing these attributes
        for par in paragraphs:
            if 'class' in par.keys():
                if par.attrib['class'] == 'detail-odstavec':
                    par.attrib.pop('class')
        paragraphs.reverse()
        for par in paragraphs[:-1]:
            try:
                # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                if len(par) > 0:
                    if (par.text is None and par.getchildren()[0].tag == 'strong'):
                        continue
                elif par.getprevious().text == u'\u2026':
                    continue
                indent = False
                # Either indent if the paragraphs are the same
                if par.getprevious().attrib == par.attrib:
                    indent = True
                # Or else if the first paragraph of the text was special
                if 'class' in par.getprevious().keys():
                    par_name = par.getprevious().attrib['class']
                    if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                        indent = True
                if indent:
                    for key in par.keys():
                        par.attrib.pop(key)
                    par.attrib['class']="indent_first_line"
            except:
                pass
        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
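
The subtle part of the deleted recipe is the bookkeeping in parse_index: it persists (url, section) pairs to a flat file and keeps only the 20 newest entries per section so the history file cannot grow without bound. The same logic, distilled into a standalone helper (the function name and signature are illustrative, not part of the recipe):

def prune_history(items, cap=20):
    # items: (url, section) pairs ordered oldest to newest,
    # as read from the recipe's history file
    counts = {}
    kept = []
    # Walk newest to oldest, keeping at most `cap` entries per section
    for url, section in reversed(items):
        if counts.get(section, 0) < cap:
            counts[section] = counts.get(section, 0) + 1
            kept.append((url, section))
    # Restore oldest-to-newest order
    kept.reverse()
    return kept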