', re.DOTALL|re.IGNORECASE), lambda match: '

')] # remove empty tags preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' ')) preprocess_regexps.append((re.compile(r' ', re.DOTALL|re.IGNORECASE), lambda match: ' ')) preprocess_regexps.append((re.compile(r'

', re.DOTALL|re.IGNORECASE), lambda match: '')) preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: '')) preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: '')) def parse_index(self): # Read already downloaded articles recipe_dir = os.path.join(config_dir,'recipes') old_articles = os.path.join(recipe_dir,self.title) past_items = [] if os.path.exists(old_articles): with file(old_articles) as f: for h in f: l = h.strip().split(" ") past_items.append((l[0]," ".join(l[1:]))) old_urls = [x[0] for x in past_items] count_items = {} current_items = [] # Keep a list of only 20 latest articles for each section past_items.reverse() for item in past_items: if item[1] in count_items.keys(): if count_items[item[1]] < 20: count_items[item[1]] += 1 current_items.append(item) else: count_items[item[1]] = 1 current_items.append(item) current_items.reverse() sections = [] # Get the webpages to download lists of articles from raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True) root = lxml.html.fromstring(raw) sections = [] for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"): try: url = section.find('a').get('href') if not ('?m=authors&person[id]=' in url): sections.append((url,section.find('a').find('b').text)) except: pass sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ')) sections.append(('http://respekt.ihned.cz/fokus/','Fokus')) sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub')) sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory')) sections.append(('http://respekt.ihned.cz/glosy/','Glosy')) # Get the list of articles ans = [] for section in sections: raw = self.index_to_soup(section[0], raw=True) root = lxml.html.fromstring(raw) list_of_articles = [] articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']") # Sort the articles in a section from oldest to newest articles.reverse() for article in articles: date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3] author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','') title = getattr(article.find("h2").find("a"),'text','') url = article.find('h2').find('a').get('href') # Only download new articles if url not in old_urls: old_urls.append(url) current_items.append((url,section[1])) list_of_articles.append({'title':title,'url':url,'date':date,'author':author}) # Redownload this page next time if it is still being updated (between 7 and 17 GMT generally, so make the limits a little bit bigger):wq if section[1] == 'Respekt DJ': if list_of_articles: if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17: # list_of_articles = list_of_articles[:-1] current_items = current_items[:-1] if list_of_articles: ans.append((section[1],list_of_articles)) # Write already downloaded articles with file(old_articles,'w') as f: f.write('\n'.join('{} {}'.format(*x) for x in current_items)) return ans # For some reason, the following does not work: # preprocess_regexps.append((re.compile(r'

', re.DOTALL|re.IGNORECASE), lambda match: '

')) def preprocess_raw_html(self, raw_html, url): return re.sub("

","

",raw_html) def preprocess_html(self,soup): raw = u''.join(unicode(a) for a in soup.contents) root = lxml.html.fromstring(raw) # Make image captions visible body = root.xpath("//div[@id='text']")[0] add = 0 for index, element in enumerate(body): try: if element.tag == 'img': body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"})) add += 1 except: pass # Make captions visible on the website have the same style try: root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption' except: pass # For DJ, the perex is always the same, so remove it if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ': perex = root.xpath("//div[@id='perex']")[0] clean = root.xpath("//div[@class='clean']")[0] perex.getparent().remove(perex) clean.getparent().remove(clean) # DJ section gets mal-formatted on kindle otherwise for i in root.xpath("//h2[@class='d-dj-t']"): i.attrib['class'] = '' E.style = "font-size:60%;font-weight:normal;" time = E('span',i.getprevious().text_content(),style=E.style) # Time should be ahead of the title time.tail = ' ' + i.text i.text = '' i.insert(0,time) for i in root.xpath("//div[@class='d-dj-d']"): i.attrib['class'] = '' i.xpath("div/span")[0].text = '' for i in root.xpath("//div[@class='d-dj-b']"): i.attrib['class'] = '' # Make captions visible on the website have the same style root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption' # Reverse the entries so that the earliest are at the top entries = root.xpath("//div[@class='d-dj-i']") entries.reverse() dj_body = entries[0].getparent() for entry in entries: dj_body.remove(entry) dj_body.append(entry) # We are not interested in this paragraph as it stays the same and is essentialy an ad if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz': ad = root.xpath("//p[@id='ajmonf']")[0] ad.getparent().remove(ad) # Add length of the articles in words after author article_length = str(len(body.text_content().split(' '))) + ' slov' root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length)) # Make perex (subheading) start on a new line root.xpath("//h1")[0].append(E.br('')) # Indent paragraphs when typographically suitable # First paragraph is never indented paragraphs = root.xpath('//p') # Clear the formatting a little bit by removing these attributes for par in paragraphs: if 'class' in par.keys(): if par.attrib['class'] == 'detail-odstavec': par.attrib.pop('class') paragraphs.reverse() for par in paragraphs[:-1]: try: # in the beginning of this paragraph means no indenting as well as ellipses as the only text in paragraph if len(par) > 0: if (par.text is None and par.getchildren()[0].tag == 'strong'): continue elif par.getprevious().text == u'\u2026': continue indent = False # Either indent if the paragraphs are the same if par.getprevious().attrib == par.attrib: indent = True # Or else if the first paragraph of the text was special if 'class' in par.getprevious().keys(): par_name = par.getprevious().attrib['class'] if par_name == '01prvniodstavecrepublicblok' or par_name == 'Zkladnodstavec' or par_name == '01titulekhlavn': indent = True if indent: for key in par.keys(): par.attrib.pop(key) par.attrib['class']="indent_first_line" except: pass return(BeautifulSoup(lxml.etree.tostring(root,encoding=unicode)))