mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update Respekt
Merge branch 'master' of https://github.com/felagund/calibre
This commit is contained in:
commit
42b2eeb3bc
@ -1,37 +0,0 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import unicode_literals
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class respektRecipe(BasicNewsRecipe):
    """Fetch recent articles and blog posts from the Czech weekly Respekt."""

    __author__ = 'bubak'
    title = u'Respekt'
    publisher = u'Respekt'
    description = 'Respekt'
    language = 'cs'
    encoding = 'cp1250'

    oldest_article = 1
    max_articles_per_feed = 20

    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Všechny články', u'http://respekt.ihned.cz/index.php?p=R00000_rss'),
        (u'Blogy', u'http://blog.respekt.ihned.cz/?p=Rb00VR_rss'),
        # (u'Respekt DJ', u'http://respekt.ihned.cz/index.php?p=R00RDJ_rss')
    ]

    # Keep the article detail only; strip the tool bars around it.
    keep_only_tags = []
    remove_tags_before = dict(name='div', attrs={'id': ['detail']})
    remove_tags_after = dict(name='div', attrs={'class': 'd-tools'})
    remove_tags = [dict(name='div', attrs={'class': ['d-tools', 'actions']})]

    # Cut everything outside the article body; paywalled remainders are
    # replaced by a short Czech notice ("the rest of the article is paid").
    preprocess_regexps = [
        (re.compile(r'<div class="paid-zone".*', re.DOTALL | re.IGNORECASE),
         lambda match: 'Za zbytek článku je nutno platit. </body>'),
        (re.compile(r'.*<div class="mm-ow">', re.DOTALL | re.IGNORECASE),
         lambda match: '<body>'),
        (re.compile(r'<div class="col3">.*', re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
    ]
|
||||
|
||||
|
||||
|
148
recipes/respekt_magazine.recipe
Normal file
148
recipes/respekt_magazine.recipe
Normal file
@ -0,0 +1,148 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
# Copyright: tomashnyk@gmail.com
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'tomashnyk@gmail.com'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
|
||||
#This imports the version bundled with Calibre
|
||||
import lxml
|
||||
from lxml.builder import E
|
||||
|
||||
class respektRecipe(BasicNewsRecipe):
    """Download the current printed issue of the Czech weekly Respekt.

    The issue index is scraped from http://respekt.ihned.cz/aktualni-cislo/;
    a publisher account (``needs_subscription``) is required for full text.
    """

    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Magazine'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the printed edition, password needed for full access'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    # Justified paragraphs plus the CSS classes that preprocess_html() below
    # attaches: image_caption and indent_first_line.
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    # NOTE(review): the next two patterns render identically in this copy; the
    # second one was presumably '<strong>&nbsp;</strong>' originally — verify
    # against the upstream recipe before changing either.
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    # Strip inline font sizing/colors so the recipe stylesheet wins.
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def get_cover_url(self):
        """Return the cover image URL scraped from the magazine homepage."""
        soup = self.index_to_soup('http://respekt.ihned.cz/')
        cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
        return cover

    needs_subscription = True

    def get_browser(self):
        """Log in to the publisher account so paid articles are reachable."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://muj-ucet.ihned.cz/')
            br.select_form(name='login')
            br['login[nick]'] = self.username
            br['login[pass]'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the feed list from the current-issue index page.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples,
        grouping articles by the "rubrika" (section) label shown next to each.
        """
        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
        root = lxml.html.fromstring(raw)
        ans = []
        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
            # The last three characters of the date-author span are dropped.
            date = article.xpath("span[@class='date-author']")[0].text[:-3]
            title = article.find("h2").find("a").text
            url = article.find('h2').find('a').get('href')
            link = {'title':title,'url':url,'date':date}
            # Append to an existing section if one matches, else start a new one.
            for section in ans:
                if section[0] == section_title:
                    section[1].append(link)
                    break
            else:
                ans.append((section_title,[link]))
        return ans

    def cleanup(self):
        """Log out of the publisher account once downloading is finished."""
        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')

    def preprocess_html(self,soup):
        """Post-process one article: captions, word count, typographic indents.

        The soup is round-tripped through lxml because the structural edits
        below (insertions, attribute rewrites) use lxml's tree API, then
        converted back to BeautifulSoup for calibre.
        """
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)

        # Make image captions visible: insert each image's title attribute as
        # a styled <p> directly after the <img> in the article body.
        body = root.xpath("//div[@id='text']")[0]
        add = 0  # running offset: each inserted caption shifts later indices
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass

        # Add length of the articles in words after author ("slov" = "words").
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))

        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))

        # Indent paragraphs when typographically suitable
        parse = True
        # There are only single paragraphs in these sections
        if root.xpath("//title")[0].text == u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ":
            parse = False
        if root.xpath("//title")[0].text == u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ":
            parse = False
        if parse:
            # First paragraph is never indented
            paragraphs = root.xpath('//p')
            # Clear the formatting a little bit by removing these attributes
            for par in paragraphs:
                if 'class' in par.keys():
                    if par.attrib['class'] == 'detail-odstavec':
                        par.attrib.pop('class')
            # Walk the paragraphs backwards; the [:-1] slice skips the first
            # (never indented) paragraph of the article.
            paragraphs.reverse()
            for par in paragraphs[:-1]:
                try:
                    # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                    if len(par) > 0:
                        if (par.text is None and par.getchildren()[0].tag == 'strong'):
                            continue
                        elif par.getprevious().text == u'\u2026':
                            continue
                    indent = False
                    # Either indent if the paragraphs are the same
                    if par.getprevious().attrib == par.attrib:
                        indent = True
                    # Or else if the first paragraph of the text was special
                    if 'class' in par.getprevious().keys():
                        par_name = par.getprevious().attrib['class']
                        if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                            indent = True
                    if indent:
                        for key in par.keys():
                            par.attrib.pop(key)
                        par.attrib['class']="indent_first_line"
                except:
                    pass

        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
|
225
recipes/respekt_web.recipe
Normal file
225
recipes/respekt_web.recipe
Normal file
@ -0,0 +1,225 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
# Copyright: tomashnyk@gmail.com
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'tomashnyk@gmail.com'
|
||||
|
||||
import re,os,datetime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
|
||||
from calibre.constants import config_dir, CONFIG_DIR_MODE
|
||||
#This imports the version bundled with Calibre
|
||||
import lxml
|
||||
from lxml.builder import E
|
||||
|
||||
class respektWebRecipe(BasicNewsRecipe):
    """Download free articles from the Respekt website.

    Already-downloaded article URLs are persisted to a file under calibre's
    config directory so only new articles are fetched on each run.
    """

    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Web'
    publisher = u'Respekt Publishing a. s.'
    description = u'Free articles from respekt.cz website'
    encoding = 'cp1250'
    language = 'cs'
    remove_javascript = True
    cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png'
    # Justified paragraphs plus the CSS classes that preprocess_html() below
    # attaches: image_caption and indent_first_line.
    extra_css = 'p {text-align:justify} \
        ul {color:black} \
        .image_caption {font-size:50%;font-style:italic;} \
        .author {text-align:left;} \
        p.indent_first_line {text-indent:30px;}'
    remove_tags_before = dict(name='div',attrs={'class':['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
        dict(name='div',attrs={'class':['slot','reklama','date']}), \
        dict(name='span', attrs={'class':['detail-vykrik']}), \
        dict(name='p', attrs={'class':['detail-vykrik']}), \
        dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
        dict(name='strong', attrs={'class':['detail-vykrik']}),
        dict(name='script')]
    # this makes authors left-aligned by not using the author class)
    preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
    # remove empty tags
    # NOTE(review): the next two patterns render identically in this copy; the
    # second one was presumably '<strong>&nbsp;</strong>' originally — verify
    # against the upstream recipe before changing either.
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
    preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
    # Strip inline font sizing/colors so the recipe stylesheet wins.
    preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
    preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))

    def parse_index(self):
        """Collect new articles from editor columns and a few fixed sections.

        Keeps a per-section history (last 20 URLs) in a state file named after
        the recipe title, and returns only articles not seen before as a list
        of ``(section_name, [article_dict, ...])`` tuples.
        """
        # Read already downloaded articles
        recipe_dir = os.path.join(config_dir,'recipes')
        # '/' is not allowed in file names, so it is replaced in the title.
        old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
        past_items = []
        if os.path.exists(old_articles):
            # Each state-file line is "<url> <section name>".
            with file(old_articles) as f:
                for h in f:
                    l = h.strip().split(" ")
                    past_items.append((l[0]," ".join(l[1:])))
        old_urls = [x[0] for x in past_items]
        count_items = {}
        current_items = []
        # Keep a list of only 20 latest articles for each section
        past_items.reverse()
        for item in past_items:
            if item[1] in count_items.keys():
                if count_items[item[1]] < 20:
                    count_items[item[1]] += 1
                    current_items.append(item)
            else:
                count_items[item[1]] = 1
                current_items.append(item)
        current_items.reverse()

        sections = []
        # Get the webpages to download lists of articles from
        raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
        root = lxml.html.fromstring(raw)
        sections = []
        for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
            try:
                url = section.find('a').get('href')
                # Skip per-author listing links; keep only column sections.
                if not ('?m=authors&person[id]=' in url):
                    sections.append((url,section.find('a').find('b').text))
            except:
                pass
        sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ'))
        sections.append(('http://respekt.ihned.cz/fokus/','Fokus'))
        sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub'))
        sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory'))
        sections.append(('http://respekt.ihned.cz/glosy/','Glosy'))

        # Get the list of articles
        ans = []
        for section in sections:
            raw = self.index_to_soup(section[0], raw=True)
            root = lxml.html.fromstring(raw)
            list_of_articles = []
            articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
            # Sort the articles in a section from oldest to newest
            articles.reverse()
            for article in articles:
                date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3]
                author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','')
                title = getattr(article.find("h2").find("a"),'text','')
                url = article.find('h2').find('a').get('href')
                # Only download new articles
                if url not in old_urls:
                    old_urls.append(url)
                    current_items.append((url,section[1]))
                    list_of_articles.append({'title':title,'url':url,'date':date,'author':author})
            # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger)
            if section[1] == 'Respekt DJ':
                if list_of_articles:
                    if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
                        #list_of_articles = list_of_articles[:-1]
                        current_items = current_items[:-1]
            if list_of_articles:
                ans.append((section[1],list_of_articles))
        # Write already downloaded articles
        with file(old_articles,'w') as f:
            f.write('\n'.join('{} {}'.format(*x) for x in current_items))
        return ans

    # For some reason, the following does not work:
    # preprocess_regexps.append((re.compile(r'<br/><br/>', re.DOTALL|re.IGNORECASE), lambda match: '</p><p>'))
    def preprocess_raw_html(self, raw_html, url):
        """Turn double <br /> breaks into real paragraph boundaries."""
        return re.sub("<br /><br />","</p><p>",raw_html)

    def preprocess_html(self,soup):
        """Post-process one article: captions, DJ-section fixes, word count,
        typographic indents.

        The soup is round-tripped through lxml because the structural edits
        below use lxml's tree API, then converted back for calibre.
        """
        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)
        # Make image captions visible: insert each image's title attribute as
        # a styled <p> directly after the <img> in the article body.
        body = root.xpath("//div[@id='text']")[0]
        add = 0  # running offset: each inserted caption shifts later indices
        for index, element in enumerate(body):
            try:
                if element.tag == 'img':
                    body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
                    add += 1
            except:
                pass
        # Make captions visible on the website have the same style
        try:
            root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
        except:
            pass
        # For DJ, the perex is always the same, so remove it
        if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':

            perex = root.xpath("//div[@id='perex']")[0]
            clean = root.xpath("//div[@class='clean']")[0]
            perex.getparent().remove(perex)
            clean.getparent().remove(clean)

            # DJ section gets mal-formatted on kindle otherwise
            for i in root.xpath("//h2[@class='d-dj-t']"):
                i.attrib['class'] = ''
                E.style = "font-size:60%;font-weight:normal;"
                time = E('span',i.getprevious().text_content(),style = E.style)
                # Time should be ahead of the title
                time.tail = ' ' + i.text
                i.text = ''
                i.insert(0,time)
            for i in root.xpath("//div[@class='d-dj-d']"):
                i.attrib['class'] = ''
                i.xpath("div/span")[0].text = ''
            for i in root.xpath("//div[@class='d-dj-b']"):
                i.attrib['class'] = ''

            # Make captions visible on the website have the same style
            root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'

            # Reverse the entries so that the earliest are at the top
            entries = root.xpath("//div[@class='d-dj-i']")
            entries.reverse()
            dj_body = entries[0].getparent()
            # Re-append in reversed order; remove-then-append moves each node.
            for entry in entries:
                dj_body.remove(entry)
                dj_body.append(entry)

        # We are not interested in this paragraph as it stays the same and is essentialy an ad
        if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
            ad = root.xpath("//p[@id='ajmonf']")[0]
            ad.getparent().remove(ad)

        # Add length of the articles in words after author ("slov" = "words").
        article_length = str(len(body.text_content().split(' '))) + ' slov'
        root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))

        # Make perex (subheading) start on a new line
        root.xpath("//h1")[0].append(E.br(''))

        # Indent paragraphs when typographically suitable
        # First paragraph is never indented
        paragraphs = root.xpath('//p')
        # Clear the formatting a little bit by removing these attributes
        for par in paragraphs:
            if 'class' in par.keys():
                if par.attrib['class'] == 'detail-odstavec':
                    par.attrib.pop('class')
        # Walk the paragraphs backwards; the [:-1] slice skips the first
        # (never indented) paragraph of the article.
        paragraphs.reverse()
        for par in paragraphs[:-1]:
            try:
                # <strong> in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph
                if len(par) > 0:
                    if (par.text is None and par.getchildren()[0].tag == 'strong'):
                        continue
                    elif par.getprevious().text == u'\u2026':
                        continue
                indent = False
                # Either indent if the paragraphs are the same
                if par.getprevious().attrib == par.attrib:
                    indent = True
                # Or else if the first paragraph of the text was special
                if 'class' in par.getprevious().keys():
                    par_name = par.getprevious().attrib['class']
                    if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn':
                        indent = True
                if indent:
                    for key in par.keys():
                        par.attrib.pop(key)
                    par.attrib['class']="indent_first_line"
            except:
                pass
        return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))
|
Loading…
x
Reference in New Issue
Block a user