diff --git a/recipes/independent.recipe b/recipes/independent.recipe index c7beac12c8..ca6b6771bb 100644 --- a/recipes/independent.recipe +++ b/recipes/independent.recipe @@ -8,6 +8,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString class TheIndependentNew(BasicNewsRecipe): + # flag to enable/disable article graphics on business pages/some others + # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html + # -max dimensions can be altered using the .pictureContainer img selector in the css + _FETCH_ARTICLE_GRAPHICS = True + + #Flag to enable/disable image fetching (not business) + _FETCH_IMAGES = True + + #used for converting rating to stars _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png' _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png' @@ -35,7 +44,15 @@ class TheIndependentNew(BasicNewsRecipe): ] keep_only_tags =[dict(attrs={'id':'main'})] + recursions = 0 + # fixes non compliant html nesting and 'marks' article graphics links + preprocess_regexps = [ + (re.compile('(?P.*?)', re.DOTALL), + lambda match: '
' + match.group('nested') + '
'), + (re.compile('.*?Click.*?to view graphic.*?', re.DOTALL), + lambda match: '
' + match.group(0) + '
'), + ] conversion_options = { @@ -62,14 +79,34 @@ class TheIndependentNew(BasicNewsRecipe): .column-1 p,a,h1,h2,h3 { margin: 0; } .column-1 div{color:#888888; margin: 0;} .articleContent {display: block; clear:left;} + .storyTop{} + .pictureContainer img { max-width: 400px; max-height: 400px;} """ oldest_article = 1 max_articles_per_feed = 100 + _processed_urls = [] + def get_article_url(self, article): + url = super(self.__class__,self).get_article_url(article) + + title = article.get('title', None) + if title and re.search("^Video:",title): + return None + + #remove duplicates + if not (url in self._processed_urls): + self._processed_urls.append(url) + else: + url = None + return url + def preprocess_html(self, soup): + + items_to_extract = [] + for item in soup.findAll(attrs={'class' : re.compile("widget.*")}): remove = True pattern = re.compile('((articleContent)|(title))$') @@ -85,7 +122,10 @@ class TheIndependentNew(BasicNewsRecipe): #images pattern = re.compile('slideshow') if (pattern.search(item['class'])) is not None: - remove = False + if self._FETCH_IMAGES: + remove = False + else: + remove = True #social widgets always bad pattern = re.compile('socialwidget') @@ -93,30 +133,36 @@ class TheIndependentNew(BasicNewsRecipe): remove = True if remove: - item.extract() + items_to_extract.append(item) - for item in soup.findAll('a',attrs={'href' : re.compile('.*')}): - if item.img is not None: - #use full size image - img = item.findNext('img') - - img['src'] = item['href'] - - #insert caption if available - if img['title'] is not None and (len(img['title']) > 1): - tag = Tag(soup,'h3') - text = NavigableString(img['title']) - tag.insert(0,text) - - #picture before text - img.extract() - item.insert(0,img) - item.insert(1,tag) - - # remove link - item.name = "div" - item["class"]='image' - del item["href"] + for item in items_to_extract: + item.extract() + + items_to_extract = [] + + if self._FETCH_IMAGES: + for item in soup.findAll('a',attrs={'href' : re.compile('.*')}): + if item.img is not None: + #use full size image + img = item.findNext('img') + + img['src'] = item['href'] + + #insert caption if available + if img['title'] is not None and (len(img['title']) > 1): + tag = Tag(soup,'h3') + text = NavigableString(img['title']) + tag.insert(0,text) + + #picture before text + img.extract() + item.insert(0,img) + item.insert(1,tag) + + # remove link + item.name = "div" + item["class"]='image' + del item["href"] #remove empty subtitles @@ -127,13 +173,12 @@ class TheIndependentNew(BasicNewsRecipe): """ subtitle = soup.find('h3',attrs={'class' : 'subtitle'}) if subtitle is not None: - subtitleText = subtitle.findNext('p') - if subtitleText is not None: - if len(subtitleText.contents[0]) <= 1 : - subtitleText.extract() - subtitle.extract() - - + subtitleText = subtitle.findNext('p') + if subtitleText is not None: + if len(subtitleText.contents[0]) <= 1 : + subtitleText.extract() + subtitle.extract() + #replace rating numbers with stars for item in soup.findAll('div',attrs={ 'class' : 'starRating'}): @@ -141,10 +186,64 @@ class TheIndependentNew(BasicNewsRecipe): soup2 = self._insertRatingStars(soup,item) if soup2 is not None: soup = soup2 + + + #remove empty paragraph tags in storyTop which can leave a space + #between first paragraph and rest of story + storyTop = soup.find('div',attrs={ 'class' : ['storyTop']}) + for item in storyTop.findAll('p'): + if item.contents is not None and len(item.contents[0]) <= 1 : + items_to_extract.append(item) + + for item in items_to_extract: + item.extract() + + items_to_extract = [] + #remove line breaks immediately next to tags with default margins + #to prevent double line spacing and narrow columns of text + storyTop = soup.find('div',attrs={ 'class' : ['storyTop']}) + self._remove_undesired_line_breaks_from_tag(storyTop,soup) + + + #replace article graphics link with the graphics themselves + if self._FETCH_ARTICLE_GRAPHICS: + items_to_insert = [] + for item in soup.findAll('div', attrs={'class' : ['article-graphic']}): + strong = item.find('strong') + for child in strong: + if isinstance(child,Tag): + if str(child.name) == 'a': + items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup)) + + for item in items_to_insert: + item[0].replaceWith(item[1]) + + for item in items_to_extract: + item.extract() + return soup + + def _get_article_graphic(self,old_item,url,soup): + + items_to_insert = [] + + if re.search('\.jpg$',str(url)): + div = Tag(soup,'div') + div['class'] = 'pictureContainer' + img = Tag(soup,'img') + img['src'] = url + img['alt'] = 'article graphic' + div.insert(0,img) + items_to_insert.append((old_item,div,)) + return items_to_insert + + soup2 = self.index_to_soup(url) + for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}): + items_to_insert.append((old_item,item),) + return items_to_insert def _insertRatingStars(self,soup,item): @@ -167,6 +266,7 @@ class TheIndependentNew(BasicNewsRecipe): def postprocess_html(self,soup, first_fetch): #find broken images and remove captions + items_to_extract = [] for item in soup.findAll('div', attrs={'class' : 'image'}): img = item.findNext('img') if img is not None and img['src'] is not None: @@ -175,20 +275,114 @@ class TheIndependentNew(BasicNewsRecipe): if pattern.match(img["src"]) is not None: caption = img.findNextSibling('h3') if caption is not None: - caption.extract() - img.extract() - return soup - - + items_to_extract.append(caption) + items_to_extract.append(img) + for item in items_to_extract: + item.extract() + return soup + + def _recurisvely_linearise_tag_tree( + self, + item, + linearised= None, + count=0, + limit = 100 + ): + linearised = linearised or [] + count = count + 1 + if count > limit: + return linearised + if not (isinstance(item,Tag)): + return linearised + for nested in item: + linearised.append(nested) + linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count) + return linearised + + + def _get_previous_tag(self,current_index, tag_tree): + if current_index == 0: + return None + else: + return tag_tree[current_index - 1] + + + def _get_next_tag(self,current_index, tag_tree): + if current_index < len(tag_tree) - 1: + return tag_tree[current_index + 1] + else: + return None + + + def _list_match(self,test_str, list_regex): + for regex in list_regex: + match = re.match(regex, test_str) + if match is not None: + return True + return False + def _remove_undesired_line_breaks_from_tag(self,parent,soup): + + if parent is None: + return + + + tag_tree = self._recurisvely_linearise_tag_tree(parent) + items_to_remove = [] + + + for item in tag_tree: + if item == u'\n': + items_to_remove.append(item) + continue; + + for item in items_to_remove: + tag_tree.remove(item) + + + spaced_tags = [r'p', r'h\d', r'blockquote'] + tags_to_extract = [] + tags_to_replace = [] + for (i, tag) in enumerate(tag_tree): + if isinstance(tag, Tag): + if str(tag) == '
': + previous_tag = self._get_previous_tag(i, tag_tree) + + if isinstance(previous_tag, Tag): + previous_tag_is_spaced = previous_tag is not None\ + and self._list_match(str(previous_tag.name), + spaced_tags) + else: + previous_tag_is_spaced = False + + next_tag = self._get_next_tag(i, tag_tree) + + if isinstance(next_tag, Tag): + next_tag_is_spaced = next_tag is not None\ + and self._list_match(str(next_tag.name), spaced_tags) + else: + next_tag_is_spaced = False + + if previous_tag_is_spaced or next_tag_is_spaced or i == 0\ + or i == len(tag_tree) - 1: + tags_to_extract.append(tag) + else: + tags_to_replace.append((tag,NavigableString(' '),)) + + + for pair in tags_to_replace: + pair[0].replaceWith(pair[1]) + for tag in tags_to_extract: + tag.extract() + feeds = [ (u'News - UK', u'http://www.independent.co.uk/news/uk/?service=rss'), (u'News - World', u'http://www.independent.co.uk/news/world/?service=rss'), (u'News - Business', - u'http://www.independent.co.uk/news/business/?service=rss'), + u'http://www.independent.co.uk/news/business/?service=rss'), (u'News - People', u'http://www.independent.co.uk/news/people/?service=rss'), (u'News - Science', @@ -289,6 +483,3 @@ class TheIndependentNew(BasicNewsRecipe): u'http://www.independent.co.uk/extras/indybest/?service=rss'), ] - - - diff --git a/src/calibre/manual/develop.rst b/src/calibre/manual/develop.rst index 1d5161db9a..3524f9b3c3 100755 --- a/src/calibre/manual/develop.rst +++ b/src/calibre/manual/develop.rst @@ -136,7 +136,7 @@ the previously checked out |app| code directory, for example:: cd /Users/kovid/work/calibre -calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:Preferences->Advanced->Miscellaneous in the |app| GUI. +calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:`Preferences->Advanced->Miscellaneous` in the |app| GUI. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. So, following the example above, it would be ``/Users/kovid/work/calibre/src``. Apple diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 1ea073d318..fb94a0d479 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -242,10 +242,6 @@ Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout. -.. note:: - As of iOS version 5 Stanza no longer works on Apple devices. Alternatives to Stanza are discussed `in this forum `_. - - Using iBooks **************