From bdf2cd48ddff2edb5b23bfbc971716ded8130994 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Oct 2010 19:24:01 -0600
Subject: [PATCH 1/4] ...

---
 resources/recipes/ming_pao.recipe | 51 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
index 6a61405698..162a3c774e 100644
--- a/resources/recipes/ming_pao.recipe
+++ b/resources/recipes/ming_pao.recipe
@@ -1,7 +1,9 @@
-cense__ = 'GPL v3'
+__license__ = 'GPL v3'
 __copyright__ = '2010, Eddie Lau'
 '''
 modified from Singtao Toronto calibre recipe by rty
+Change Log:
+2010/10/31: skip repeated articles in section pages
 '''
 
 import datetime
@@ -23,42 +25,37 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
     recursions = 0
     conversion_options = {'linearize_tables':True}
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
-    keep_only_tags = [dict(name='h1'), dict(attrs={'id':['newscontent01','newscontent02']})]
 
 
     def get_fetchdate(self):
         dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time
-        dt_local = dt_utc - datetime.timedelta(-8.0/24)
+        # convert UTC to local hk time - at around HKT 5.30am, all news are available
+        dt_local = dt_utc - datetime.timedelta(-2.5/24)
         return dt_local.strftime("%Y%m%d")
 
 
     def parse_index(self):
-        feeds = []
-        dateStr = self.get_fetchdate()
-        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
+        feeds = []
+        dateStr = self.get_fetchdate()
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
         return feeds
 
 
     def parse_section(self, url):
-        dateStr = self.get_fetchdate()
-        soup = self.index_to_soup(url)
-        divs = soup.findAll(attrs={'class': ['bullet']})
-        current_articles = []
-        for i in divs:
-            a = i.find('a', href = True)
-            title = self.tag_to_string(a)
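
The substance of patch 1 is the duplicate-skipping logic in parse_section(): every article URL is recorded in included_urls, and a link whose URL has already been seen is ignored. A minimal standalone sketch of the same pattern (the helper name and sample links are illustrative, not part of the recipe):

    def dedupe_articles(links):
        # Keep only the first occurrence of each URL, preserving order --
        # the same check the patched parse_section() performs per link.
        current_articles, included_urls = [], []
        for title, url in links:
            if url not in included_urls:
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        return current_articles

    print(dedupe_articles([('A', '/a.htm'), ('B', '/b.htm'), ('A again', '/a.htm')]))
    # -> only two entries; the repeated '/a.htm' is skipped

The get_fetchdate() change is smaller but worth decoding: subtracting a negative timedelta shifts UTC forward, and moving the offset from +8 hours (true HKT) to +2.5 hours means the computed date only rolls over at 21:30 UTC, i.e. 5:30am HKT, when, per the new comment, the day's news is fully available.
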
-            url = a.get('href', False)
-            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet']})
+        current_articles = []
+        included_urls = []
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' +url
+            if url not in included_urls:
                 current_articles.append({'title': title, 'url': url, 'description':''})
-        return current_articles
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll(width=True):
-            del item['width']
-        return soup
+                included_urls.append(url)
+        return current_articles

From 21731b3c046da70cdc63fa348f164b9d5f4218cc Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Oct 2010 21:00:38 -0600
Subject: [PATCH 2/4] ...

---
 src/calibre/utils/html2text.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py
index afe5a0aded..0eb84a3d38 100644
--- a/src/calibre/utils/html2text.py
+++ b/src/calibre/utils/html2text.py
@@ -9,7 +9,7 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
 #   Support decoded entities with unifiable.
 
 if not hasattr(__builtins__, 'True'): True, False = 1, 0
-import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
+import re, sys, urllib, htmlentitydefs, codecs
 import sgmllib
 import urlparse
 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -18,17 +18,17 @@
 try: from textwrap import wrap
 except: pass
 
 # Use Unicode characters instead of their ascii psuedo-replacements
-UNICODE_SNOB = 0
+UNICODE_SNOB = 1
 
 # Put the links after each paragraph instead of at the end.
 LINKS_EACH_PARAGRAPH = 0
 
 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
-BODY_WIDTH = 78
+BODY_WIDTH = 0
 
 # Don't show internal links (href="#local-anchor") -- corresponding link targets
 # won't be visible in the plain text file anyway.
-SKIP_INTERNAL_LINKS = False
+SKIP_INTERNAL_LINKS = True
 
 ### Entity Nonsense ###
@@ -433,8 +433,9 @@ if __name__ == "__main__":
         j = urllib.urlopen(baseurl)
         try:
             from feedparser import _getCharacterEncoding as enc
+            enc
         except ImportError:
-            enc = lambda x, y: ('utf-8', 1)
+            enc = lambda x, y: ('utf-8', 1)
         text = j.read()
         encoding = enc(j.headers, text)[0]
         if encoding == 'us-ascii': encoding = 'utf-8'
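
Patch 2 retunes calibre's vendored html2text through its module-level switches: the unused StringIO and types imports go away, UNICODE_SNOB = 1 keeps real Unicode characters instead of ASCII stand-ins, BODY_WIDTH = 0 turns off hard wrapping, and SKIP_INTERNAL_LINKS = True stops emitting href="#..." anchors as links. The lone "enc" statement added after the feedparser import presumably just references the name so code checkers treat the import as used. A quick way to observe the new defaults from inside a calibre checkout (Python 2, since the module depends on sgmllib and htmlentitydefs; the sample HTML is made up):

    # -*- coding: utf-8 -*-
    from calibre.utils.html2text import html2text

    html = (u'<p>A paragraph long enough that the old BODY_WIDTH = 78 default '
            u'would have hard-wrapped it, plus an entity: &mdash;</p>'
            u'<p><a href="#local">internal link</a></p>')

    # BODY_WIDTH = 0 leaves the paragraph on one line, UNICODE_SNOB = 1 keeps
    # a real em dash rather than '--', and SKIP_INTERNAL_LINKS = True renders
    # the href="#local" anchor as plain text instead of a link.
    print(html2text(html))
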
From 073bf833712d7827ebe2ecfcb0b36478ea75d878 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Oct 2010 22:22:49 -0600
Subject: [PATCH 3/4] El Faro de Vigo by Jefferson Frantz.
 Fixes #405 (New news feed)

---
 resources/recipes/el_faro.recipe | 77 ++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 resources/recipes/el_faro.recipe

diff --git a/resources/recipes/el_faro.recipe b/resources/recipes/el_faro.recipe
new file mode 100644
index 0000000000..ec1b74b5cb
--- /dev/null
+++ b/resources/recipes/el_faro.recipe
@@ -0,0 +1,77 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElFaroDeVigo(BasicNewsRecipe):
+    title = u'El Faro de Vigo'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__ = 'Jefferson Frantz'
+    description = 'Noticias de Vigo'
+    timefmt = ' [%d %b, %Y]'
+    language = 'es'
+    encoding = 'cp1252'
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = [
+##        (u'Vigo', u'http://www.farodevigo.es/elementosInt/rss/1'),
+##        (u'Gran Vigo', u'http://www.farodevigo.es/elementosInt/rss/2'),
+        (u'Galicia', u'http://www.farodevigo.es/elementosInt/rss/4'),
+        (u'España', u'http://www.farodevigo.es/elementosInt/rss/6'),
+        (u'Mundo', u'http://www.farodevigo.es/elementosInt/rss/7'),
+##        (u'Opinión', u'http://www.farodevigo.es/elementosInt/rss/5'),
+        (u'Economía', u'http://www.farodevigo.es/elementosInt/rss/10'),
+        (u'Sociedad y Cultura', u'http://www.farodevigo.es/elementosInt/rss/8'),
+        (u'Sucesos', u'http://www.farodevigo.es/elementosInt/rss/9'),
+        (u'Deportes', u'http://www.farodevigo.es/elementosInt/rss/11'),
+        (u'Agenda', u'http://www.farodevigo.es/elementosInt/rss/21'),
+        (u'Gente', u'http://www.farodevigo.es/elementosInt/rss/24'),
+        (u'Televisión', u'http://www.farodevigo.es/elementosInt/rss/25'),
+        (u'Ciencia y Tecnología', u'http://www.farodevigo.es/elementosInt/rss/26')]
+
+    extra_css = '''.noticia_texto{ font-family: sans-serif; font-size: medium; text-align: justify }
+                   h1{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}
+                   h2{font-family: serif; font-size: medium; font-weight: bold; color: #000000; text-align: left}
+                   .enlacenegrita10{font-family: serif; font-size: small; font-weight: bold; color: #000000; text-align: left}
+                   .noticia_titular{font-family: serif; font-size: x-large; font-weight: bold; color: #000000; text-align: center}'''
+
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+
+        url = 'http://estaticos00.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+        fitem = soup.find('img',src=url)
+        if fitem:
+            par = fitem.parent
+            par.extract()
+        url = 'http://estaticos01.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+        fitem = soup.find('img',src=url)
+        if fitem:
+            par = fitem.parent
+            par.extract()
+        url = 'http://estaticos02.farodevigo.es//elementosWeb/mediaweb/images/compartir/barrapunto.gif'
+        fitem = soup.find('img',src=url)
+        if fitem:
+            par = fitem.parent
+            par.extract()
+
+        return self.adeify_images(soup)
+
+    def postprocess_html(self, soup, first_fetch):
+        divs = soup.findAll(True, {'class':'enlacenegrita10'})
+        for div in divs:
+            div['align'] = 'left'
+
+        return soup
+
+
+    keep_only_tags = [dict(name='div', attrs={'class':['noticias']})]
+
+    remove_tags = [
+                dict(name=['object','link','script','ul','iframe','ol'])
+               ,dict(name='div', attrs={'class':['noticiadd2', 'cintillo2', 'noticiadd', 'noticiadd2']})
+               ,dict(name='div', attrs={'class':['imagen_derecha', 'noticiadd3', 'extraHTML']})
+
+    ]
+
+
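
The three barrapunto.gif blocks in the new recipe's preprocess_html() are copies that differ only in the numbered static host (estaticos00/01/02). A more compact equivalent, offered as an untested sketch on the assumption that the three URLs really differ only in that number (the doubled slash after the hostname is kept verbatim from the recipe):

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        # The sharing bar may be served from any of three numbered static
        # hosts; remove the parent of whichever copy the page contains.
        for n in ('00', '01', '02'):
            url = ('http://estaticos%s.farodevigo.es//elementosWeb/'
                   'mediaweb/images/compartir/barrapunto.gif' % n)
            fitem = soup.find('img', src=url)
            if fitem:
                fitem.parent.extract()
        return self.adeify_images(soup)
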
From 0c8684fa2191d1329860de55c364718c991db469 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Oct 2010 22:43:24 -0600
Subject: [PATCH 4/4] Fix #7369 (0.7.26)

---
 src/calibre/ebooks/metadata/amazon.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py
index 9c89016e8b..81d996c6a7 100644
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@@ -72,7 +72,10 @@ def get_metadata(br, asin, mi):
         return False
     raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
-    root = soupparser.fromstring(raw)
+    try:
+        root = soupparser.fromstring(raw)
+    except:
+        return False
     ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
     if ratings:
         pat = re.compile(r'([0-9.]+) out of (\d+) stars')
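
Patch 4 makes Amazon metadata download fail soft: when lxml's soupparser cannot parse the fetched page at all, get_metadata() now reports failure by returning False instead of raising. The same guard in isolation (soupparser needs BeautifulSoup installed; the markup and helper name are illustrative):

    from lxml.html import soupparser

    def parse_or_none(raw):
        # Mirror the patched get_metadata(): treat a parse failure as
        # "nothing found" rather than letting the exception propagate.
        try:
            return soupparser.fromstring(raw)
        except Exception:
            return None

    root = parse_or_none('<html><body><p>ok</p></body></html>')
    print(root is not None)  # True for parseable input

The patch uses a bare except:, which is broader than the except Exception: shown here, but the intent is identical.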