From 1b20a25051132bcc2d0ab7f6a3fd910f0b591f93 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:52:29 +0200 Subject: [PATCH 01/20] Delete berria.recipe --- recipes/berria.recipe | 44 ------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 recipes/berria.recipe diff --git a/recipes/berria.recipe b/recipes/berria.recipe deleted file mode 100644 index bb04e63388..0000000000 --- a/recipes/berria.recipe +++ /dev/null @@ -1,44 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2012, Alayn Gortazar ' -''' -www.berria.info -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class Berria(BasicNewsRecipe): - title = 'Berria' - __author__ = 'Alayn Gortazar' - description = 'Euskal Herriko euskarazko egunkaria' - publisher = 'Berria' - category = 'news, politics, sports, Basque Country' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'eu' - remove_empty_feeds = True - masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png' - - keep_only_tags = [ - dict(id='goiburua'), - dict(name='div', attrs={'class': ['ber_ikus']}), - dict(name='section', attrs={'class': 'ber_ikus'}) - ] - remove_tags = [ - dict(name='a', attrs={'class': 'iruzkinak'}), - dict(name='div', attrs={'class': 'laguntzaileak'}) - ] - - extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}' # noqa - - feeds = [ - (u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'), - (u'Iritzia', u'http://berria.info/rss/iritzia.xml'), - (u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'), - (u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'), - (u'Mundua', u'http://berria.info/rss/mundua.xml'), - (u'Kirola', u'http://berria.info/rss/kirola.xml'), - (u'Plaza', u'http://berria.info/rss/plaza.xml') - ] From fde2794af4b6464a7374eb228269784906c274de Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:53:11 +0200 Subject: [PATCH 02/20] Delete bbc_arabic.recipe --- recipes/bbc_arabic.recipe | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 recipes/bbc_arabic.recipe diff --git a/recipes/bbc_arabic.recipe b/recipes/bbc_arabic.recipe deleted file mode 100644 index 0884f111f4..0000000000 --- a/recipes/bbc_arabic.recipe +++ /dev/null @@ -1,21 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class BBCArabic(BasicNewsRecipe): - title = u'BBC Arabic Middle East' - oldest_article = 7 - max_articles_per_feed = 100 - extra_css = 'body { text-align: right; direction:rtl; } ' - auto_cleanup = True - language = 'ar' - __author__ = 'logophile777' - remove_tags = [ - {'class': ['emp-alt-handheld', 'emp-noflash', - 'emp-flashlink', 'emp-alt-screen']} - ] - - feeds = [(u'BBC Arabic Middle East', - u'http://www.bbc.co.uk/arabic/middleeast/index.xml')] - - def print_version(self, url): - return url + '?print=1' From 7ed9d11811a240c3336f765c8134e2ad2caee3cf Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:56:58 +0200 Subject: [PATCH 03/20] Delete dnevni_avaz.recipe --- recipes/dnevni_avaz.recipe | 76 -------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 recipes/dnevni_avaz.recipe diff --git 
a/recipes/dnevni_avaz.recipe b/recipes/dnevni_avaz.recipe deleted file mode 100644 index 45916f70a7..0000000000 --- a/recipes/dnevni_avaz.recipe +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' - -''' -dnevniavaz.ba -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - -class DnevniAvaz(BasicNewsRecipe): - title = 'Dnevni Avaz' - __author__ = 'Darko Miletic' - description = 'Latest news from Bosnia' - publisher = 'Dnevni Avaz' - category = 'news, politics, Bosnia and Herzegovina' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - remove_javascript = True - cover_url = 'http://www.dnevniavaz.ba/img/logo.gif' - lang = 'bs-BA' - language = 'bs' - - direction = 'ltr' - - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' # noqa - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True - } - - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - keep_only_tags = [dict(name='div', attrs={'id': [ - 'fullarticle-title', 'fullarticle-leading', 'fullarticle-date', 'fullarticle-text', 'articleauthor']})] - - remove_tags = [dict(name=['object', 'link', 'base'])] - - feeds = [ - (u'Najnovije', u'http://www.dnevniavaz.ba/rss/novo'), (u'Najpopularnije', - u'http://www.dnevniavaz.ba/rss/popularno') - ] - - def replace_tagname(self, soup, tagname, tagid, newtagname): - headtag = soup.find(tagname, attrs={'id': tagid}) - if headtag: - headtag.name = newtagname - return - - def preprocess_html(self, soup): - soup.html['xml:lang'] = self.lang - soup.html['lang'] = self.lang - mlang = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Language"), ("content", self.lang)]) - mcharset = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Type"), ("content", "text/html; charset=UTF-8")]) - soup.head.insert(0, mlang) - soup.head.insert(1, mcharset) - self.replace_tagname(soup, 'div', 'fullarticle-title', 'h1') - self.replace_tagname(soup, 'div', 'fullarticle-leading', 'h3') - self.replace_tagname(soup, 'div', 'fullarticle-date', 'h5') - return self.adeify_images(soup) From a2b03714e8379475f0c38bfe3f94d1b5d4d80086 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:59:35 +0200 Subject: [PATCH 04/20] Delete blog_da_cidadania.recipe --- recipes/blog_da_cidadania.recipe | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 recipes/blog_da_cidadania.recipe diff --git a/recipes/blog_da_cidadania.recipe b/recipes/blog_da_cidadania.recipe deleted file mode 100644 index b94ad18625..0000000000 --- a/recipes/blog_da_cidadania.recipe +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -from calibre.web.feeds.news import BasicNewsRecipe - - -class BlogdaCidadania(BasicNewsRecipe): - title = 'Blog da Cidadania' - __author__ = 'Diniz Bortolotto' - description = 'Posts do Blog da Cidadania' - oldest_article = 7 - max_articles_per_feed = 50 - encoding = 'utf8' - publisher = 'Eduardo Guimaraes' - category = 'politics, Brazil' - 
language = 'pt_BR' - publication_type = 'politics portal' - - feeds = [(u'Blog da Cidadania', u'http://www.blogcidadania.com.br/feed/')] - - reverse_article_order = True From 897df23ab93718911aeb4bc30c381f5618082ebc Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 10:01:06 +0200 Subject: [PATCH 05/20] Delete escrevinhador.recipe --- recipes/escrevinhador.recipe | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 recipes/escrevinhador.recipe diff --git a/recipes/escrevinhador.recipe b/recipes/escrevinhador.recipe deleted file mode 100644 index 3011e88bbe..0000000000 --- a/recipes/escrevinhador.recipe +++ /dev/null @@ -1,28 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class Escrevinhador(BasicNewsRecipe): - title = 'Blog Escrevinhador' - __author__ = 'Diniz Bortolotto' - description = 'Posts do Blog Escrevinhador' - publisher = 'Rodrigo Viana' - oldest_article = 5 - max_articles_per_feed = 20 - category = 'news, politics, Brazil' - language = 'pt_BR' - publication_type = 'news and politics portal' - use_embedded_content = False - no_stylesheets = True - remove_javascript = True - - feeds = [(u'Blog Escrevinhador', u'http://www.rodrigovianna.com.br/feed')] - - reverse_article_order = True - - remove_tags_after = [dict(name='div', attrs={'class': 'text'})] - - remove_tags = [ - dict(id='header'), - dict(name='p', attrs={'class': 'tags'}), - dict(name='div', attrs={'class': 'sociable'}) - ] From daaf4c4b550199db8aad422c4fc23c7124934be2 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:29:47 +0200 Subject: [PATCH 06/20] Delete carta_capital.recipe --- recipes/carta_capital.recipe | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 recipes/carta_capital.recipe diff --git a/recipes/carta_capital.recipe b/recipes/carta_capital.recipe deleted file mode 100644 index 9d189bab07..0000000000 --- a/recipes/carta_capital.recipe +++ /dev/null @@ -1,29 +0,0 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -from __future__ import absolute_import, division, print_function, unicode_literals - -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1380852962(BasicNewsRecipe): - title = u'Carta Capital' - __author__ = 'Erico Lisboa' - language = 'pt_BR' - oldest_article = 15 - max_articles_per_feed = 100 - auto_cleanup = True - use_embedded_content = False - - feeds = [(u'Pol\xedtica', - u'http://www.cartacapital.com.br/politica/politica/rss'), (u'Economia', - u'http://www.cartacapital.com.br/economia/economia/atom.xml'), - (u'Sociedade', - u'http://www.cartacapital.com.br/sociedade/sociedade/atom.xml'), - (u'Internacional', - u'http://www.cartacapital.com.br/internacional/internacional/atom.xml'), - (u'Tecnologia', - u'http://www.cartacapital.com.br/tecnologia/tecnologia/atom.xml'), - (u'Cultura', - u'http://www.cartacapital.com.br/cultura/cultura/atom.xml'), - (u'Sa\xfade', u'http://www.cartacapital.com.br/saude/saude/atom.xml'), - (u'Educa\xe7\xe3o', - u'http://www.cartacapital.com.br/educacao/educacao/atom.xml')] From faf65ab82eced9b5e88c11d8c82d6ea637c78772 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:30:16 +0200 Subject: [PATCH 07/20] Delete caros_amigos.recipe --- recipes/caros_amigos.recipe | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 recipes/caros_amigos.recipe diff --git 
a/recipes/caros_amigos.recipe b/recipes/caros_amigos.recipe deleted file mode 100644 index b1d5b2cb8e..0000000000 --- a/recipes/caros_amigos.recipe +++ /dev/null @@ -1,16 +0,0 @@ -__copyright__ = '2011, Pablo Aldama ' -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1311839910(BasicNewsRecipe): - title = u'Caros Amigos' - oldest_article = 20 - max_articles_per_feed = 100 - language = 'pt_BR' - __author__ = 'Pablo Aldama' - - feeds = [(u'Caros Amigos', - u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss')] - keep_only_tags = [dict(name='div', attrs={'class': ['blog']}), dict(name='div', attrs={'class': ['blogcontent']}) - ] - remove_tags = [dict(name='div', attrs={'class': 'addtoany'})] From a193acc8262dd64a50c7ac36920351d916c187e0 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:30:42 +0200 Subject: [PATCH 08/20] Delete idg_now.recipe --- recipes/idg_now.recipe | 49 ------------------------------------------ 1 file changed, 49 deletions(-) delete mode 100644 recipes/idg_now.recipe diff --git a/recipes/idg_now.recipe b/recipes/idg_now.recipe deleted file mode 100644 index 74657d5ab6..0000000000 --- a/recipes/idg_now.recipe +++ /dev/null @@ -1,49 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class IDGNow(BasicNewsRecipe): - title = 'IDG Now!' - __author__ = 'Diniz Bortolotto' - description = 'Posts do IDG Now!' - oldest_article = 7 - max_articles_per_feed = 20 - encoding = 'utf8' - publisher = 'Now!Digital Business Ltda.' - category = 'technology, telecom, IT, Brazil' - language = 'pt_BR' - publication_type = 'technology portal' - use_embedded_content = False - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - - def get_article_url(self, article): - link = article.get('link', None) - if link is None: - return article - if link.split('/')[-1] == "story01.htm": - link = link.split('/')[-2] - a = ['0B', '0C', '0D', '0E', '0F', '0G', - '0I', '0N', '0L0S', '0A', '0J3A'] - b = ['.', '/', '?', '-', '=', '&', '_', '.com', 'www.', '0', ':'] - for i in range(0, len(a)): - link = link.replace(a[i], b[i]) - link = link.split('&')[-3] - link = link.split('=')[1] - link = link + "/IDGNoticiaPrint_view" - return link - - feeds = [ - (u'Ultimas noticias', u'http://rss.idgnow.com.br/c/32184/f/499640/index.rss'), - (u'Computa\xe7\xe3o Corporativa', - u'http://rss.idgnow.com.br/c/32184/f/499643/index.rss'), - (u'Carreira', u'http://rss.idgnow.com.br/c/32184/f/499644/index.rss'), - (u'Computa\xe7\xe3o Pessoal', - u'http://rss.idgnow.com.br/c/32184/f/499645/index.rss'), - (u'Internet', u'http://rss.idgnow.com.br/c/32184/f/499646/index.rss'), - (u'Mercado', u'http://rss.idgnow.com.br/c/32184/f/419982/index.rss'), - (u'Seguran\xe7a', - u'http://rss.idgnow.com.br/c/32184/f/499647/index.rss'), - (u'Telecom e Redes', - u'http://rss.idgnow.com.br/c/32184/f/499648/index.rss') - ] - - reverse_article_order = True From c581643026afa38faf0c8325ba8f9e8966e0fc9e Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:31:12 +0200 Subject: [PATCH 09/20] Delete noticias_r7.recipe --- recipes/noticias_r7.recipe | 43 -------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 recipes/noticias_r7.recipe diff --git a/recipes/noticias_r7.recipe b/recipes/noticias_r7.recipe deleted file mode 100644 index 3bf88a87f1..0000000000 --- a/recipes/noticias_r7.recipe +++ /dev/null @@ -1,43 +0,0 @@ 
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class PortalR7(BasicNewsRecipe):
-    title = 'Noticias R7'
-    __author__ = 'Diniz Bortolotto'
-    description = 'Noticias Portal R7'
-    oldest_article = 2
-    max_articles_per_feed = 20
-    encoding = 'utf8'
-    publisher = 'Rede Record'
-    category = 'news, Brazil'
-    language = 'pt_BR'
-    publication_type = 'newsportal'
-    use_embedded_content = False
-    no_stylesheets = True
-    remove_javascript = True
-    remove_attributes = ['style']
-
-    feeds = [
-        (u'Brasil', u'http://www.r7.com/data/rss/brasil.xml'),
-        (u'Economia', u'http://www.r7.com/data/rss/economia.xml'),
-        (u'Internacional',
-         u'http://www.r7.com/data/rss/internacional.xml'),
-        (u'Tecnologia e Ci\xeancia',
-         u'http://www.r7.com/data/rss/tecnologiaCiencia.xml')
-    ]
-    reverse_article_order = True
-
-    keep_only_tags = [dict(name='div', attrs={'class': 'materia'})]
-    remove_tags = [
-        dict(id=['espalhe', 'report-erro']),
-        dict(name='ul', attrs={'class': 'controles'}),
-        dict(name='ul', attrs={'class': 'relacionados'}),
-        dict(name='div', attrs={'class': 'materia_banner'}),
-        dict(name='div', attrs={'class': 'materia_controles'})
-    ]
-
-    preprocess_regexps = [
-        (re.compile(r'
.*
', re.DOTALL | re.IGNORECASE), - lambda match: '
') - ] From 578c05905d0bb5716111965cf72997ba63088d75 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:31:35 +0200 Subject: [PATCH 10/20] Delete noticias_unb.recipe --- recipes/noticias_unb.recipe | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 recipes/noticias_unb.recipe diff --git a/recipes/noticias_unb.recipe b/recipes/noticias_unb.recipe deleted file mode 100644 index a38e8648d8..0000000000 --- a/recipes/noticias_unb.recipe +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -from calibre.web.feeds.news import BasicNewsRecipe - - -class NoticiasUnB(BasicNewsRecipe): - title = 'Noticias UnB' - __author__ = 'Diniz Bortolotto' - description = 'Noticias da UnB' - oldest_article = 5 - max_articles_per_feed = 20 - category = 'news, educational, Brazil' - language = 'pt_BR' - publication_type = 'newsportal' - use_embedded_content = False - no_stylesheets = True - remove_javascript = True - - feeds = [(u'UnB AgĂȘncia', u'http://www.unb.br/noticias/rss/noticias.rss')] - - reverse_article_order = True - - def print_version(self, url): - return url.replace('http://', 'http://www.unb.br/noticias/print_email/imprimir.php?u=http://') From 2a8b9133de053ceb54c455050a41f98658a4955e Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:39:34 +0200 Subject: [PATCH 11/20] Delete ming_pao_toronto.recipe --- recipes/ming_pao_toronto.recipe | 1018 ------------------------------- 1 file changed, 1018 deletions(-) delete mode 100644 recipes/ming_pao_toronto.recipe diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe deleted file mode 100644 index 24792ae76a..0000000000 --- a/recipes/ming_pao_toronto.recipe +++ /dev/null @@ -1,1018 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2011, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Toronto' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". (Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: False) -__IncludeSummary__ = False -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True -# (HK only) Turn below to True if you wish hi-res images (Default: False) -__HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below -__Date__ = '' - - -''' -Change Log: -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. 
-2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
-2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
-2011/10/19: fix a bug in txt source parsing
-2011/10/17: disable fetching of premium content, also improved txt source parsing
-2011/10/04: option to get hi-res photos for the articles
-2011/09/21: fetching "column" section is made optional.
-2011/09/18: parse "column" section stuff from source text file directly.
-2011/09/07: disable "column" section as it is no longer offered free.
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
-            provide options to remove all images in the file
-2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
-2011/03/06: add new articles for finance section, also a new section "Columns"
-2011/02/28: rearrange the sections
-            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
-            View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
-            folder in Kindle 3
-2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
-            clean up the indentation
-2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
-            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
-2010/11/22: add English section, remove eco-news section which is not updated daily, correct
-            ordering of articles
-2010/11/12: add news image and eco-news section
-2010/11/08: add parsing of finance section
-2010/11/06: temporary work-around for Kindle device having no capability to display unicode
-            in section/article list.
-2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}) - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "") - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = dt_utc 
+ \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \ - '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + 
'/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + - dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', - 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + - dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - 
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news.mingpao.com - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' + url - # replace the url to the print-friendly version - if __ParsePFF__ is True: - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 
'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = 
self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # Match non-digits - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - return new_html - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - # thumbnails shouldn't be available if using hi-res images - if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if 
mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 903acbddc00ea7ebe9c1f65cb50155a10afa5083 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:40:46 +0200 Subject: [PATCH 12/20] Delete ming_pao.recipe --- recipes/ming_pao.recipe | 1180 --------------------------------------- 1 file changed, 1180 deletions(-) delete mode 100644 recipes/ming_pao.recipe diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe deleted file mode 100644 index f77db49be9..0000000000 --- a/recipes/ming_pao.recipe +++ /dev/null @@ -1,1180 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2013, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Hong Kong' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". 
(Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: True) -__IncludeSummary__ = True -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with their printer-friendly formats (Default: False) -__ParsePF__ = False -# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with text formats (Default: True) -- override __ParsePF__ -__ParseTxt__ = True -# (HK only) Use mobile text version for some articles (Default: False) -__ParseSelectedMobile__ = False -# (HK only) Turn below to True if you wish hi-res images (Default: True) -__HiResImg__ = True -# Override the date returned by the program if specifying a YYYYMMDD below -# (not work if __ParseSelectedMobile__ is True and __UseLife__ is False) -__Date__ = '' - - -''' -Change Log: -2014/10/19: update urls of some web location and top logo -2013/09/28: allow thumbnails even with hi-res images -2012/04/24: improved parsing of news1.mingpao.com content -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. -2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' -2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt -2011/10/19: fix a bug in txt source parsing -2011/10/17: disable fetching of premium content, also improved txt source parsing -2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. -2011/09/18: parse "column" section stuff from source text file directly. -2011/09/07: disable "column" section as it is no longer offered free. 
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source - provide options to remove all images in the file -2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages -2011/03/06: add new articles for finance section, also a new section "Columns" -2011/02/28: rearrange the sections - [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles - View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" - folder in Kindle 3 -2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles - clean up the indentation -2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list - (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) -2010/11/22: add English section, remove eco-news section which is not updated daily, correct - ordering of articles -2010/11/12: add news image and eco-news section -2010/11/08: add parsing of finance section -2010/11/06: temporary work-around for Kindle device having no capability to display unicode - in section/article list. -2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news1.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/mingpaonews_logo.png' - remove_tags_before = dict(name='font', attrs={'color': ['navy']}) - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}), - dict(name='table', attrs={'width': ['100%'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'border':['0']}) # content 
table in pda site - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # non-article images in life.mingpao.com article - dict(name='img', attrs={ - 'alt': ["明報網站", "按此列印", "關閉本視窗"]}), - dict(name='img', attrs={ - 'src': ["../image/top_2.gif"]}) - # dict(name='table') # for content fetched from life.mingpao.com - # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from life.mingpao.com - # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: ""), - (re.compile(r'


', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'', re.DOTALL | re.IGNORECASE), - lambda match: ''), - # (re.compile(r'[
.+?]', re.DOTALL|re.IGNORECASE), - # lambda match: '') - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = 
dt_utc + \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - # Note: does not work with custom date given by __Date__ - def get_weekday(self): - return self.get_dtlocal().weekday() - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news1.mingpao.com/' + self.get_fetchdate() + '/' + \ - self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - # new - if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): - # if both not on Sunday and not __ParseSelectedMobile__, go ahead - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = 
self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if self.get_weekday() != 6: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - else: - if __InclPremium__ is True and __ParseSelectedMobile__ is True: - articles = self.parse_section_mobile( - 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') - if articles: - feeds.append( - (u'\u526f\u520a Supplement', articles)) - else: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - # end of new - else: - for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news1.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'), - (u'\u6e2f\u805e Local', 'http://news1.mingpao.com/' + - dateStr + '/gbindex.htm', 'gb'), - (u'\u6559\u80b2 Education', 'http://news1.mingpao.com/' + - dateStr + '/gfindex.htm', 'gf'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news1.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news1.mingpao.com/' + dateStr + '/faindex.htm', 'fa'), - (u'\u4e2d\u570b China', 'http://news1.mingpao.com/' + - dateStr + '/caindex.htm', 'ca'), - (u'\u570b\u969b World', 'http://news1.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news1.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news1.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = 
self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): - # if both not on Sunday or not __ParseSelectedMobile__, go ahead - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is False or self.get_weekday() != 6: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - else: - if __InclPremium__ is True and __ParseSelectedMobile__ is True: - articles = self.parse_section_mobile( - 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') - if articles: - feeds.append( - (u'\u526f\u520a Supplement', articles)) - else: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 
'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news1.mingpao.com (web html) - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news1.mingpao.com/' + dateStr + '/' + url - # replace the url to the alternative version - if __ParsePF__ is True: - # printer-friendly option - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - if __InclPremium__ is True: - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - # if url not in included_urls and url.rfind('Redirect') == -1 and - # (__InclPremium__ is False or - # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from news1.mingpao.com (txt) - def parse_section_txt(self, url, ch): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - # print 'Base url: ', url - # replace the url to the alternative version - # text version - if url.rfind('Redirect') != -1: - url = 'http://news1.mingpao.com/' + dateStr + '/' + url - # print 'original url: ', url - url = re.sub( - dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url) - url = re.sub('%2F', '/', url) - if __InclPremium__ is True: - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '.txt') - url = url.replace('%5F', '_') - else: - # get the first two char in url as ch - seckey = url[0:2] - url = url.replace('.htm', '.txt') - url = 'http://news1.mingpao.com/ftp/WebNews2/' + \ - dateStr + '/' + ch + '/' + seckey + '/' + url - # print 'updated url: ', url - - if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - # if url not in included_urls and (url.rfind('Redirect') == -1) - # and (__InclPremium__ is 
False or - # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from mobile version - def parse_section_mobile(self, base, page): - soup = self.index_to_soup(base + '/' + page) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = i.get('href', False) - if url not in included_urls and url.rfind('HotNews2.cfm') != -1: - current_articles.append( - {'title': title, 'url': base + '/' + url, 'description': ''}) - included_urls.append(url) - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = 
soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # split the text source on newlines - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news1.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - if __ParseTxt__ is False: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - else: - # insert to front - # print 'imgstr: ', img - pos = img.find('_') - new_html = new_html.replace(img[5:], '_' + img[5:]) - - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - # test - # print new_html - return new_html - - def preprocess_html(self, soup): - for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}): - mobiletitle.name = 'h1' - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = 
TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 2ca33f5da16a94615d1c7229a455f6a76d5e1635 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:11 +0200 Subject: [PATCH 13/20] Delete nanfengchuang.recipe --- recipes/nanfengchuang.recipe | 106 ----------------------------------- 1 file changed, 106 deletions(-) delete mode 100644 recipes/nanfengchuang.recipe diff --git a/recipes/nanfengchuang.recipe b/recipes/nanfengchuang.recipe deleted file mode 100644 index 18a7be556f..0000000000 --- a/recipes/nanfengchuang.recipe +++ /dev/null @@ -1,106 +0,0 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai -from __future__ import unicode_literals -from calibre.web.feeds.recipes import BasicNewsRecipe -from lxml import html - -__license__ = 'GPL v3' - - -class Nfcmag(BasicNewsRecipe): - - __author__ = '2014, Chen Wei ' - title = 'Nan Feng Chuang / South Reviews Magazine' - description = ''' -South Reviews Magazine, established in 1985, is a Guangzhou-based political and -economic biweekly. 
South Reviews enjoys a reputation of being fair and objective, with graceful -narration, insightful expression among its readers, mostly government -officials, economic leaders and intellectuals. It has been praised as “the No.1 -Political& Economical Magazine in China”. - -The US magazine Time described South Reviews as "a highbrow news magazine". -Other international media organizations such as BBC and NHK have conducted -tracking shots of South Reviews journalists, to record their unique value -special position in China’s media industry. Harvard-Yenching Library, Stanford -University's East Asia Library and UC Berkeley Library have collections of the -magazine since its first issue, taking them as an important source to -understand China's economic and social reform. - -Since 2008, South Reviews has been committed to transforming into a -research-based media organization. Most of its editors, reporters and -contributors have remarkably strong academic backgrounds, coming from Peking -University, Tsinghua University, London School of Economics and Political -Science, the Chinese University of Hong Kong, Renmin University of China, and -other well-known institutions. The magazine has established research divisions, -including the State Policy Research Center and the Brand Promotion Research -Center, working in cooperation with well-known academic institutions and -providing valuable research reports for governments and companies. - -''' - language = 'zh' - encoding = 'UTF-8' - publisher = 'South Reviews Magazine' - publication_type = 'magazine' - category = 'news, Business, China' - timefmt = ' [%a, %d %b, %Y]' - needs_subscription = False - - remove_tags = [dict(attrs={'class': ['side-left', 'side-right', - 'breadcrumbs', 'score', 'weboNav']}), - dict(attrs={'id': ['header', 'footer']}), - dict(name=['script', 'noscript', 'style'])] - no_stylesheets = True - remove_javascript = True - current_issue_url = "" - current_issue_cover = "" - - def parse_index(self): - - baseurl = 'http://www.nfcmag.com/' - raw = self.index_to_soup('http://www.nfcmag.com/magazine', raw=True) - soup_start = html.fromstring(raw) - - els = soup_start.xpath("""//div[contains(@class, 'lastest-magazine') - and contains(@class, 'comBox')] - //a[@href and not(@id) and not(child::img)] - """) - for x in els: - issueurl = x.get('href') - if not issueurl.lower().startswith('http://'): - issueurl = baseurl + issueurl - break - - raw = self.index_to_soup(issueurl, raw=True) - soup_issue = html.fromstring(raw) - - coverimg = soup_issue.xpath("""//div[contains(@class, 'lastest-magazine') - and contains(@class, 'comBox')] - //img[@*] """) - imgurl = coverimg[0].get('src') - if not imgurl.lower().startswith('http://'): - imgurl = baseurl + imgurl - self.current_issue_cover = imgurl - feeds = [] - - sections = soup_issue.xpath("""//div[contains(@class, 'article-box') - and contains(@class, 'comBox')] """) - for sec in sections: - pages = sec.xpath('.//h5') - sec_title = sec.xpath('.//h4')[0].text_content() - self.log('Found section:', sec_title) - articles = [] - for x in pages: - url = x.xpath('.//a')[0].get('href') - if not url.lower().startswith('http://'): - url = baseurl + url - url = url[:-5] + '-s.html' # to print view - - title = x.text_content() - - articles.append({'title': title, 'url': url, 'date': None}) - - if articles: - feeds.append((sec_title, articles)) - return feeds - - def get_cover_url(self): - return self.current_issue_cover From 7814e8b8ec8fc8c57f2adc297b5d6fcb4c57d120 Mon Sep 17 00:00:00 2001 From: Jony 
<23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:28 +0200 Subject: [PATCH 14/20] Delete ming_pao_vancouver.recipe --- recipes/ming_pao_vancouver.recipe | 1018 ----------------------------- 1 file changed, 1018 deletions(-) delete mode 100644 recipes/ming_pao_vancouver.recipe diff --git a/recipes/ming_pao_vancouver.recipe b/recipes/ming_pao_vancouver.recipe deleted file mode 100644 index f8b1e9309d..0000000000 --- a/recipes/ming_pao_vancouver.recipe +++ /dev/null @@ -1,1018 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2011, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Vancouver' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". (Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: False) -__IncludeSummary__ = False -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True -# (HK only) Turn below to True if you wish hi-res images (Default: False) -__HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below -__Date__ = '' - - -''' -Change Log: -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. -2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' -2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt -2011/10/19: fix a bug in txt source parsing -2011/10/17: disable fetching of premium content, also improved txt source parsing -2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. -2011/09/18: parse "column" section stuff from source text file directly. -2011/09/07: disable "column" section as it is no longer offered free. 
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source - provide options to remove all images in the file -2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages -2011/03/06: add new articles for finance section, also a new section "Columns" -2011/02/28: rearrange the sections - [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles - View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" - folder in Kindle 3 -2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles - clean up the indentation -2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list - (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) -2010/11/22: add English section, remove eco-news section which is not updated daily, correct - ordering of articles -2010/11/12: add news image and eco-news section -2010/11/08: add parsing of finance section -2010/11/06: temporary work-around for Kindle device having no capability to display unicode - in section/article list. -2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}) - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in 
life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "") - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = dt_utc 
+ \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \ - '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + 
'/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + - dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', - 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + - dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - 
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news.mingpao.com - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' + url - # replace the url to the print-friendly version - if __ParsePFF__ is True: - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 
'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = 
self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # split on newlines - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - return new_html - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - # thumbnails shouldn't be available if using hi-res images - if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if 
mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 4cfd52708a73731052a8f569dd90f3fc3e9d01f1 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:44 +0200 Subject: [PATCH 15/20] Delete cnd_weekly.recipe --- recipes/cnd_weekly.recipe | 74 --------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 recipes/cnd_weekly.recipe diff --git a/recipes/cnd_weekly.recipe b/recipes/cnd_weekly.recipe deleted file mode 100644 index 7566ec9548..0000000000 --- a/recipes/cnd_weekly.recipe +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2010, Derek Liang ' -''' -cnd.org -''' -import re - -from calibre.web.feeds.news import BasicNewsRecipe - - -class TheCND(BasicNewsRecipe): - - title = 'CND Weekly' - __author__ = 'Derek Liang' - description = '' - INDEX = 'http://cnd.org' - language = 'zh' - conversion_options = {'linearize_tables': True} - - remove_tags_before = 
dict(name='div', id='articleHead') - remove_tags_after = dict(id='copyright') - remove_tags = [dict(name='table', attrs={'align': 'right'}), dict(name='img', attrs={ - 'src': 'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})] - no_stylesheets = True - - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: ''), - (re.compile('', - re.DOTALL), lambda m: ''), - ] - - def print_version(self, url): - if url.find('news/article.php') >= 0: - return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url) - else: - return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url) - - def parse_index(self): - soup = self.index_to_soup(self.INDEX) - - feeds = [] - articles = {} - - for a in soup.findAll('a', attrs={'target': '_cnd'}): - url = a['href'] - if url.find('article.php') < 0: - continue - if url.startswith('/'): - url = 'http://cnd.org' + url - title = self.tag_to_string(a) - date = a.nextSibling - if not re.search('cm', date): - continue - self.log('\tFound article: ', title, 'at', url, '@', date) - if (date is not None) and len(date) > 2: - if date not in articles: - articles[date] = [] - articles[date].append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - self.log('\t\tAppend to : ', date) - - sorted_articles = sorted(articles) - while sorted_articles: - mostCurrent = sorted_articles.pop() - self.title = 'CND ' + mostCurrent - feeds.append((self.title, articles[mostCurrent])) - - return feeds - - def populate_article_metadata(self, article, soup, first): - header = soup.find('h3') - self.log('header: ' + self.tag_to_string(header)) - pass From 4e1bbdbd0180ae6e0a20b5418310520162084a81 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:00 +0200 Subject: [PATCH 16/20] Delete cnd.recipe --- recipes/cnd.recipe | 73 ---------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 recipes/cnd.recipe diff --git a/recipes/cnd.recipe b/recipes/cnd.recipe deleted file mode 100644 index 320f355e01..0000000000 --- a/recipes/cnd.recipe +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2010, Derek Liang ' -''' -cnd.org -''' -import re - -from calibre.web.feeds.news import BasicNewsRecipe - - -class TheCND(BasicNewsRecipe): - - title = 'CND' - __author__ = 'Derek Liang' - description = '' - INDEX = 'http://cnd.org' - language = 'zh' - conversion_options = {'linearize_tables': True} - - remove_tags_before = dict(name='div', id='articleHead') - remove_tags_after = dict(id='copyright') - remove_tags = [dict(name='table', attrs={'align': 'right'}), dict(name='img', attrs={ - 'src': 'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})] - no_stylesheets = True - - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: ''), - (re.compile('
', - re.DOTALL), lambda m: ''), - ] - - def print_version(self, url): - if url.find('news/article.php') >= 0: - return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url) - else: - return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url) - - def parse_index(self): - soup = self.index_to_soup(self.INDEX) - - feeds = [] - articles = {} - - for a in soup.findAll('a', attrs={'target': '_cnd'}): - url = a['href'] - if url.find('article.php') < 0: - continue - if url.startswith('/'): - url = 'http://cnd.org' + url - title = self.tag_to_string(a) - self.log('\tFound article: ', title, 'at', url) - date = a.nextSibling - if re.search('cm', date): - continue - if (date is not None) and len(date) > 2: - if date not in articles: - articles[date] = [] - articles[date].append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - self.log('\t\tAppend to : ', date) - - mostCurrent = sorted(articles).pop() - self.title = 'CND ' + mostCurrent - - feeds.append((self.title, articles[mostCurrent])) - - return feeds - - def populate_article_metadata(self, article, soup, first): - header = soup.find('h3') - self.log('header: ' + self.tag_to_string(header)) - pass From f4e807cab02f344eabf1875f4a0563bcac9e77eb Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:16 +0200 Subject: [PATCH 17/20] Delete caijing.recipe --- recipes/caijing.recipe | 90 ------------------------------------------ 1 file changed, 90 deletions(-) delete mode 100644 recipes/caijing.recipe diff --git a/recipes/caijing.recipe b/recipes/caijing.recipe deleted file mode 100644 index e0825862b2..0000000000 --- a/recipes/caijing.recipe +++ /dev/null @@ -1,90 +0,0 @@ -import re -from calibre.web.feeds.recipes import BasicNewsRecipe - -__license__ = 'GPL v3' - - -class Caijing(BasicNewsRecipe): - - '''based on the recipe wrote by Eric Chen at 2011''' - - __author__ = '2014, Chen Wei ' - title = 'Caijing Magazine' - description = ''' - Founded in 1998, the fortnightly CAIJING Magazine has firmly established - itself as a news authority and leading voice for business and financial - issues in China. - - CAIJING Magazine closely tracks the most important aspects of China's - economic reforms, developments and policy changes, as well as major events - in the capital markets. It also offers a broad international perspective - through first-hand reporting on international political and economic - issues. - - CAIJING Magazine is China's most widely read business and finance magazine, - with a circulation of 225,000 per issue. 
It boasts top-level readers from - government, business and academic circles.''' - language = 'zh' - encoding = 'UTF-8' - publisher = 'Caijing Magazine' - publication_type = 'magazine' - category = 'news, Business, China' - timefmt = ' [%a, %d %b, %Y]' - needs_subscription = True - - remove_tags = [dict(attrs={'class': ['head_nav', 'mcont_logo', 'header', - 'bottom', 'footer', 'magazine_ipad', 'cjartShare', 'ar_about', - 'main_rt', 'mcont_nav', 'new']}), - dict(attrs={'id': ['articlePl']}), - dict(name=['script', 'noscript', 'style'])] - no_stylesheets = True - remove_javascript = True - current_issue_url = "" - current_issue_cover = "" - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open('http://service.caijing.com.cn/usermanage/login') - br.select_form(name='mainLoginForm') - br['username'] = self.username - br['password'] = self.password - br.submit() - return br - - def parse_index(self): - soup_start = self.index_to_soup('http://magazine.caijing.com.cn/') - jumpurl = soup_start.find('script').contents[0].split() - for line in jumpurl: - if 'http' in line.lower(): - issuesurl = line.split('"')[1] - break - - soup_issues = self.index_to_soup(issuesurl) - # find the latest issue - div = soup_issues.find('div', attrs={'class': 'fmcon'}) - current_issue_url = div.find('a', href=True)['href'] - - soup = self.index_to_soup(current_issue_url) - coverimg = soup.find('div', {'class': 'zzfm_img'}) - self.current_issue_cover = coverimg.find('img')['src'] - - feeds = [] - for section in soup.findAll('div', - attrs={'class': re.compile(r'(fmwz_ml|zzlm_nr)2?$')}): - section_title = self.tag_to_string(section.find('div', - attrs={'class': re.compile(r'(lmnav_bt|zzlm_bt)1?$')})) - self.log('Found section:', section_title) - articles = [] - for post in section.findAll('div', - attrs={'class': re.compile(r'(fmwz_bt|zzlm_nr_bt)')}): - title = self.tag_to_string(post) - url = post.find('a')['href'] - articles.append({'title': title, 'url': url, 'date': None}) - - if articles: - feeds.append((section_title, articles)) - return feeds - - def get_cover_url(self): - return self.current_issue_cover From 98777bf9da5e0c2ce4ef2ef71903529ddb1dcd86 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:46 +0200 Subject: [PATCH 18/20] Delete bbc_chinese.recipe --- recipes/bbc_chinese.recipe | 46 -------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 recipes/bbc_chinese.recipe diff --git a/recipes/bbc_chinese.recipe b/recipes/bbc_chinese.recipe deleted file mode 100644 index 61531db3c9..0000000000 --- a/recipes/bbc_chinese.recipe +++ /dev/null @@ -1,46 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1277443634(BasicNewsRecipe): - title = u'BBC Chinese' - oldest_article = 7 - max_articles_per_feed = 100 - - feeds = [ - (u'\u4e3b\u9875', u'http://www.bbc.co.uk/zhongwen/simp/index.xml'), - (u'\u56fd\u9645\u65b0\u95fb', - u'http://www.bbc.co.uk/zhongwen/simp/world/index.xml'), - (u'\u4e24\u5cb8\u4e09\u5730', - u'http://www.bbc.co.uk/zhongwen/simp/china/index.xml'), - (u'\u91d1\u878d\u8d22\u7ecf', - u'http://www.bbc.co.uk/zhongwen/simp/business/index.xml'), - (u'\u7f51\u4e0a\u4e92\u52a8', - u'http://www.bbc.co.uk/zhongwen/simp/interactive/index.xml'), - (u'\u97f3\u89c6\u56fe\u7247', - u'http://www.bbc.co.uk/zhongwen/simp/multimedia/index.xml'), - (u'\u5206\u6790\u8bc4\u8bba', - 
u'http://www.bbc.co.uk/zhongwen/simp/indepth/index.xml') - ] - extra_css = ''' - @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n - body {margin-right: 8pt; font-family: 'DroidFont', serif;}\n - h1 {font-family: 'DroidFont', serif;}\n - .articledescription {font-family: 'DroidFont', serif;} - ''' - __author__ = 'rty' - __version__ = '1.0' - language = 'zh' - pubisher = 'British Broadcasting Corporation' - description = 'BBC news in Chinese' - category = 'News, Chinese' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - encoding = 'UTF-8' - conversion_options = {'linearize_tables': True} - masthead_url = 'http://wscdn.bbc.co.uk/zhongwen/simp/images/1024/brand.jpg' - keep_only_tags = [ - dict(name='h1'), - dict(name='p', attrs={'class': ['primary-topic', 'summary']}), - dict(name='div', attrs={'class': ['bodytext', 'datestamp']}), - ] From bad72317420c65076e875a90cac534b7ab6f16fa Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:43:52 +0200 Subject: [PATCH 19/20] Delete elperiodico_catalan.recipe --- recipes/elperiodico_catalan.recipe | 73 ------------------------------ 1 file changed, 73 deletions(-) delete mode 100644 recipes/elperiodico_catalan.recipe diff --git a/recipes/elperiodico_catalan.recipe b/recipes/elperiodico_catalan.recipe deleted file mode 100644 index 6d8be7749a..0000000000 --- a/recipes/elperiodico_catalan.recipe +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic ' -''' -elperiodico.cat -''' - -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - -class ElPeriodico_cat(BasicNewsRecipe): - title = 'El Periodico de Catalunya' - __author__ = 'Jordi Balcells/Darko Miletic' - description = 'Noticies des de Catalunya' - publisher = 'elperiodico.cat' - category = 'news, politics, Spain, Catalunya' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - delay = 1 - encoding = 'cp1252' - language = 'ca' - - html2lrf_options = [ - '--comment', description, '--category', category, '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + \ - '"\ncomments="' + description + '"\ntags="' + category + '"' - - feeds = [(u'Portada', u'http://www.elperiodico.cat/ca/rss/rss_portada.xml'), - (u'Internacional', u'http://www.elperiodico.cat/ca/rss/internacional/rss.xml'), - (u'Societat', u'http://www.elperiodico.cat/ca/rss/societat/rss.xml'), - (u'Ci\xe8ncia i tecnologia', - u'http://www.elperiodico.cat/ca/rss/ciencia-i-tecnologia/rss.xml'), - (u'Esports', u'http://www.elperiodico.cat/ca/rss/esports/rss.xml'), - (u'Gent', u'http://www.elperiodico.cat/ca/rss/gent/rss.xml'), - (u'Opini\xf3', u'http://www.elperiodico.cat/ca/rss/opinio/rss.xml'), - (u'Pol\xedtica', u'http://www.elperiodico.cat/ca/rss/politica/rss.xml'), - (u'Barcelona', u'http://www.elperiodico.cat/ca/rss/barcelona/rss.xml'), - (u'Economia', u'http://www.elperiodico.cat/ca/rss/economia/rss.xml'), - (u'Cultura i espectacles', - u'http://www.elperiodico.cat/ca/rss/cultura-i-espectacles/rss.xml'), - (u'Tele', 
u'http://www.elperiodico.cat/ca/rss/tele/rss.xml')] - - keep_only_tags = [dict(name='div', attrs={'class': 'titularnoticia'}), - dict(name='div', attrs={'class': 'noticia_completa'})] - - remove_tags = [dict(name='div', attrs={'class': ['opcionb', 'opcionb last', 'columna_noticia']}), - dict(name='span', attrs={'class': 'opcionesnoticia'}) - ] - - def print_version(self, url): - return url.replace('/default.asp?', '/print.asp?') - - def preprocess_html(self, soup): - mcharset = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Type"), ("content", "text/html; charset=utf-8")]) - soup.head.insert(0, mcharset) - for item in soup.findAll(style=True): - del item['style'] - return soup From f3111e2a72fcc21b0ee5b873e80ebce15486e64a Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:45:49 +0200 Subject: [PATCH 20/20] Delete china_press.recipe --- recipes/china_press.recipe | 69 -------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 recipes/china_press.recipe diff --git a/recipes/china_press.recipe b/recipes/china_press.recipe deleted file mode 100644 index 39946e9490..0000000000 --- a/recipes/china_press.recipe +++ /dev/null @@ -1,69 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1277228948(BasicNewsRecipe): - title = u'China Press USA' - oldest_article = 7 - max_articles_per_feed = 100 - - __author__ = 'rty' - __version__ = '1.0' - language = 'zh' - pubisher = 'www.chinapressusa.com' - description = 'Overseas Chinese Network Newspaper in the USA' - category = 'News in Chinese, USA' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - encoding = 'UTF-8' - conversion_options = {'linearize_tables': True} - masthead_url = 'http://www.chinapressusa.com/common/images/logo.gif' - extra_css = ''' - @font-face { font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n - body { - margin-right: 8pt; - font-family: 'DroidFont', serif;} - h1 {font-family: 'DroidFont', serif, sans-serif} - .show {font-family: 'DroidFont', serif, sans-serif} - ''' - feeds = [ - (u'\u65b0\u95fb\u9891\u9053', u'http://news.uschinapress.com/news.xml'), - (u'\u534e\u4eba\u9891\u9053', u'http://chinese.uschinapress.com/chinese.xml'), - (u'\u8bc4\u8bba\u9891\u9053', u'http://review.uschinapress.com/review.xml'), - ] - keep_only_tags = [ - dict(name='div', attrs={'class': 'show'}), - ] - remove_tags = [ - # dict(name='table', attrs={'class':'xle'}), - dict(name='div', attrs={'class': 'time'}), - ] - remove_tags_after = [ - dict(name='div', attrs={'class': 'bank17'}), - # dict(name='a', attrs={'class':'ab12'}), - ] - - def append_page(self, soup, appendtag, position): - pager = soup.find('div', attrs={'id': 'displaypagenum'}) - if pager: - nexturl = self.INDEX + pager.a['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'class': 'show'}) - for it in texttag.findAll(style=True): - del it['style'] - newpos = len(texttag.contents) - self.append_page(soup2, texttag, newpos) - texttag.extract() - appendtag.insert(position, texttag) - - def preprocess_html(self, soup): - mtag = '\n' - soup.head.insert(0, mtag) - - for item in soup.findAll(style=True): - del item['style'] - self.append_page(soup, soup.body, 3) - pager = soup.find('div', attrs={'id': 'displaypagenum'}) - if pager: - pager.extract() - return soup