diff --git a/recipes/respekt_magazine.recipe b/recipes/respekt_magazine.recipe index d4deb1d6e4..eaa66e0edb 100644 --- a/recipes/respekt_magazine.recipe +++ b/recipes/respekt_magazine.recipe @@ -12,137 +12,178 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag #This imports the version bundled with Calibre import lxml from lxml.builder import E +respekt_url = 'http://www.respekt.cz' class respektRecipe(BasicNewsRecipe): - __author__ = u'Tomáš Hnyk' - title = u'Respekt - Magazine' + __author__ = 'Tomáš Hnyk' publisher = u'Respekt Publishing a. s.' - description = u'Articles from the printed edition, password needed for full access' - encoding = 'cp1250' + description = u'Articles from the print edition' + encoding = 'utf-8' language = 'cs' remove_javascript = True - extra_css = 'p {text-align:justify} \ - ul {color:black} \ - .image_caption {font-size:50%;font-style:italic;} \ - .author {text-align:left;} \ - p.indent_first_line {text-indent:30px;}' - remove_tags_before = dict(name='div',attrs={'class':['l']}) - remove_tags_after = dict(id='text') - remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \ - dict(name='div',attrs={'class':['slot','reklama','date']}), \ - dict(name='span', attrs={'class':['detail-vykrik']}), \ - dict(name='p', attrs={'class':['detail-vykrik']}), \ - dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this - dict(name='strong', attrs={'class':['detail-vykrik']}), + remove_tags_before = dict(name='h1') + remove_tags_after = [dict(id='postcontent')] + remove_tags = [dict(name='div',attrs={'id':['postsharepopup','survey-respondents']}), \ + dict(name='div',attrs={'class':['ad','ad-content','adinarticle','ad-caption','post-actions','authorship-note','quote','postgallery']}), \ + dict(name='a',attrs={'class':['quote','authorship-face']}), \ + dict(name='span',attrs={'class':'embed'}), \ + dict(name='svg'), \ dict(name='script')] - # this makes authors left-aligned by not using the author class) - preprocess_regexps = [(re.compile(r'
'))
- def preprocess_raw_html(self, raw_html, url):
- return re.sub("
","
",raw_html) - - def preprocess_html(self,soup): - raw = u''.join(unicode(a) for a in soup.contents) - root = lxml.html.fromstring(raw) - # Make image captions visible - body = root.xpath("//div[@id='text']")[0] - add = 0 - for index, element in enumerate(body): - try: - if element.tag == 'img': - body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"})) - add += 1 - except: - pass - # Make captions visible on the website have the same style - try: - root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption' - except: - pass - # For DJ, the perex is always the same, so remove it - if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ': - - perex = root.xpath("//div[@id='perex']")[0] - clean = root.xpath("//div[@class='clean']")[0] - perex.getparent().remove(perex) - clean.getparent().remove(clean) - - # DJ section gets mal-formatted on kindle otherwise - for i in root.xpath("//h2[@class='d-dj-t']"): - i.attrib['class'] = '' - E.style = "font-size:60%;font-weight:normal;" - time = E('span',i.getprevious().text_content(),style = E.style) - # Time should be ahead of the title - time.tail = ' ' + i.text - i.text = '' - i.insert(0,time) - for i in root.xpath("//div[@class='d-dj-d']"): - i.attrib['class'] = '' - i.xpath("div/span")[0].text = '' - for i in root.xpath("//div[@class='d-dj-b']"): - i.attrib['class'] = '' - - # Make captions visible on the website have the same style - root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption' - - # Reverse the entries so that the earliest are at the top - entries = root.xpath("//div[@class='d-dj-i']") - entries.reverse() - dj_body = entries[0].getparent() - for entry in entries: - dj_body.remove(entry) - dj_body.append(entry) - - # We are not interested in this paragraph as it stays the same and is essentialy an ad - if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz': - ad = root.xpath("//p[@id='ajmonf']")[0] - ad.getparent().remove(ad) - - # Add length of the articles in words after author - article_length = str(len(body.text_content().split(' '))) + ' slov' - root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length)) - - # Make perex (subheading) start on a new line - root.xpath("//h1")[0].append(E.br('')) - - # Indent paragraphs when typographically suitable - # First paragraph is never indented - paragraphs = root.xpath('//p') - # Clear the formatting a little bit by removing these attributes - for par in paragraphs: - if 'class' in par.keys(): - if par.attrib['class'] == 'detail-odstavec': - par.attrib.pop('class') - paragraphs.reverse() - for par in paragraphs[:-1]: - try: - # in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph - if len(par) > 0: - if (par.text is None and par.getchildren()[0].tag == 'strong'): - continue - elif par.getprevious().text == u'\u2026': - continue - indent = False - # Either indent if the paragraphs are the same - if par.getprevious().attrib == par.attrib: - indent = True - # Or else if the first paragraph of the text was special - if 'class' in par.getprevious().keys(): - par_name = par.getprevious().attrib['class'] - if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn': - indent = True - if indent: - for key in par.keys(): - par.attrib.pop(key) - par.attrib['class']="indent_first_line" - except: - pass - return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))