Hurriyet Daily News by spswerling

commit 92cfab55a1
parent 2b23b0d342

recipes/hurriyet_daily_news.recipe | 260 (new file)
@@ -0,0 +1,260 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''
import os, string, inspect, datetime, re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
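
# A recipe like this can be previewed locally with calibre's ebook-convert
# tool, e.g.:  ebook-convert hurriyet_daily_news.recipe out.epub --test -vv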

class HurriyetDailyNews_en(BasicNewsRecipe):
    title       = u'Hurriyet Daily News'
    __author__  = u'spswerling'
    description = 'English version of the Turkish daily "Hurriyet"'
    no_stylesheets = True
    encoding    = 'utf-8'
    category    = 'news'
    language    = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 25
    max_articles_per_subsection = 7

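    # Top-level site sections to crawl, matched case-insensitively against
    # the text of the home page navigation links.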
    sections = [
                 u'turkey',
                 u'economy',
                 u'world',
                 u'sports',
                 # u'life',
                 u'opinion',
                 # u'arts/culture'
               ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]

    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
        ]

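    # Crawl state: article dicts keyed by section name, queued links per
    # subsection, and the list of URLs already seen (to skip duplicates).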
    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

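    # Step 1: scrape the home page navigation for links whose text matches
    # one of the entries in self.sections.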
    def section_links_from_home_page(self):

        def include_link(link):
            return self.text(link).lower() in self.sections

        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. hunt down section links.')

        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})

        # list() so the filtered links survive being iterated for logging
        filtered_links = list(filter(include_link, links))
        self._p(' all sections: ' + ', '.join(map(self.text, links)))
        self._p(' filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))

        return filtered_links

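    # Step 2: look for subsection links within a section page; if none are
    # found, fall back to using the section page itself.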
    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider subsection')
            return []
        self._p('Got ' + url)

        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
        return links

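    # Step 3: queue every article link (anchors whose id contains
    # 'NewsDetail') found on a subsection page.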
    def parse_subsection(self, section_link, subsection_link):

        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #    return

        self._p('hit section ' + section +
           ', subsect ' + self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return

        self._p('Process links')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        # .get() so anchors without a title attribute do not raise KeyError
        title = link.get('title') or self.text(link)
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

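    # De-duplicate URLs and enforce the per-section / per-subsection caps
    # before adding an entry to the article queue.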
    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return

        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
            self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
                        dict(title=full_title,
                            url=full_url,
                            date='',
                            description='',
                            author='',
                            content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            # keep everything before the first fragment marker
            abs_url = abs_url.split('#')[0]

        return abs_url

    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
        return from_fn

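    # Returning None from preprocess_raw_html makes calibre abort the
    # download of that article; it is used here to drop stale pieces.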
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            #   http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(HurriyetDailyNews_en, self).preprocess_raw_html(
                raw_html, url)

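    # This recipe builds its index by spidering pages rather than parsing
    # RSS feeds, so the oldest_article cutoff is enforced manually here by
    # scraping each article's date from the page.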
    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
        except ValueError:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
            except ValueError:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
                soup.find('p', {'class': 'Tarih'})
        if dnode:
            dstring = self.text(dnode)
            return self.date_from_string(dstring)
        else:
            return None

    def _dbg_soup_node(self, node):
        s = '   cls: ' + str(node.get('class')).strip() + \
              '  id: ' + str(node.get('id')).strip() + \
              ' txt: ' + self.text(node)
        return s

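    # Debug logging helper: prefixes each message with the (uppercased)
    # name of the calling function and truncates it to 120 characters.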
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])