From 1b20a25051132bcc2d0ab7f6a3fd910f0b591f93 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:52:29 +0200 Subject: [PATCH 01/20] Delete berria.recipe --- recipes/berria.recipe | 44 ------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 recipes/berria.recipe diff --git a/recipes/berria.recipe b/recipes/berria.recipe deleted file mode 100644 index bb04e63388..0000000000 --- a/recipes/berria.recipe +++ /dev/null @@ -1,44 +0,0 @@ -__license__ = 'GPL v3' -__copyright__ = '2012, Alayn Gortazar ' -''' -www.berria.info -''' - -from calibre.web.feeds.news import BasicNewsRecipe - - -class Berria(BasicNewsRecipe): - title = 'Berria' - __author__ = 'Alayn Gortazar' - description = 'Euskal Herriko euskarazko egunkaria' - publisher = 'Berria' - category = 'news, politics, sports, Basque Country' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'eu' - remove_empty_feeds = True - masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png' - - keep_only_tags = [ - dict(id='goiburua'), - dict(name='div', attrs={'class': ['ber_ikus']}), - dict(name='section', attrs={'class': 'ber_ikus'}) - ] - remove_tags = [ - dict(name='a', attrs={'class': 'iruzkinak'}), - dict(name='div', attrs={'class': 'laguntzaileak'}) - ] - - extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}' # noqa - - feeds = [ - (u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'), - (u'Iritzia', u'http://berria.info/rss/iritzia.xml'), - (u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'), - (u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'), - (u'Mundua', u'http://berria.info/rss/mundua.xml'), - (u'Kirola', u'http://berria.info/rss/kirola.xml'), - (u'Plaza', u'http://berria.info/rss/plaza.xml') - ] From fde2794af4b6464a7374eb228269784906c274de Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:53:11 +0200 Subject: [PATCH 02/20] Delete bbc_arabic.recipe --- recipes/bbc_arabic.recipe | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 recipes/bbc_arabic.recipe diff --git a/recipes/bbc_arabic.recipe b/recipes/bbc_arabic.recipe deleted file mode 100644 index 0884f111f4..0000000000 --- a/recipes/bbc_arabic.recipe +++ /dev/null @@ -1,21 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class BBCArabic(BasicNewsRecipe): - title = u'BBC Arabic Middle East' - oldest_article = 7 - max_articles_per_feed = 100 - extra_css = 'body { text-align: right; direction:rtl; } ' - auto_cleanup = True - language = 'ar' - __author__ = 'logophile777' - remove_tags = [ - {'class': ['emp-alt-handheld', 'emp-noflash', - 'emp-flashlink', 'emp-alt-screen']} - ] - - feeds = [(u'BBC Arabic Middle East', - u'http://www.bbc.co.uk/arabic/middleeast/index.xml')] - - def print_version(self, url): - return url + '?print=1' From 7ed9d11811a240c3336f765c8134e2ad2caee3cf Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:56:58 +0200 Subject: [PATCH 03/20] Delete dnevni_avaz.recipe --- recipes/dnevni_avaz.recipe | 76 -------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 recipes/dnevni_avaz.recipe diff --git 
a/recipes/dnevni_avaz.recipe b/recipes/dnevni_avaz.recipe deleted file mode 100644 index 45916f70a7..0000000000 --- a/recipes/dnevni_avaz.recipe +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' - -''' -dnevniavaz.ba -''' - -import re -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - -class DnevniAvaz(BasicNewsRecipe): - title = 'Dnevni Avaz' - __author__ = 'Darko Miletic' - description = 'Latest news from Bosnia' - publisher = 'Dnevni Avaz' - category = 'news, politics, Bosnia and Herzegovina' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - remove_javascript = True - cover_url = 'http://www.dnevniavaz.ba/img/logo.gif' - lang = 'bs-BA' - language = 'bs' - - direction = 'ltr' - - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' # noqa - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True - } - - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - keep_only_tags = [dict(name='div', attrs={'id': [ - 'fullarticle-title', 'fullarticle-leading', 'fullarticle-date', 'fullarticle-text', 'articleauthor']})] - - remove_tags = [dict(name=['object', 'link', 'base'])] - - feeds = [ - (u'Najnovije', u'http://www.dnevniavaz.ba/rss/novo'), (u'Najpopularnije', - u'http://www.dnevniavaz.ba/rss/popularno') - ] - - def replace_tagname(self, soup, tagname, tagid, newtagname): - headtag = soup.find(tagname, attrs={'id': tagid}) - if headtag: - headtag.name = newtagname - return - - def preprocess_html(self, soup): - soup.html['xml:lang'] = self.lang - soup.html['lang'] = self.lang - mlang = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Language"), ("content", self.lang)]) - mcharset = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Type"), ("content", "text/html; charset=UTF-8")]) - soup.head.insert(0, mlang) - soup.head.insert(1, mcharset) - self.replace_tagname(soup, 'div', 'fullarticle-title', 'h1') - self.replace_tagname(soup, 'div', 'fullarticle-leading', 'h3') - self.replace_tagname(soup, 'div', 'fullarticle-date', 'h5') - return self.adeify_images(soup) From a2b03714e8379475f0c38bfe3f94d1b5d4d80086 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 09:59:35 +0200 Subject: [PATCH 04/20] Delete blog_da_cidadania.recipe --- recipes/blog_da_cidadania.recipe | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 recipes/blog_da_cidadania.recipe diff --git a/recipes/blog_da_cidadania.recipe b/recipes/blog_da_cidadania.recipe deleted file mode 100644 index b94ad18625..0000000000 --- a/recipes/blog_da_cidadania.recipe +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -from calibre.web.feeds.news import BasicNewsRecipe - - -class BlogdaCidadania(BasicNewsRecipe): - title = 'Blog da Cidadania' - __author__ = 'Diniz Bortolotto' - description = 'Posts do Blog da Cidadania' - oldest_article = 7 - max_articles_per_feed = 50 - encoding = 'utf8' - publisher = 'Eduardo Guimaraes' - category = 'politics, Brazil' - 
language = 'pt_BR' - publication_type = 'politics portal' - - feeds = [(u'Blog da Cidadania', u'http://www.blogcidadania.com.br/feed/')] - - reverse_article_order = True From 897df23ab93718911aeb4bc30c381f5618082ebc Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 10:01:06 +0200 Subject: [PATCH 05/20] Delete escrevinhador.recipe --- recipes/escrevinhador.recipe | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 recipes/escrevinhador.recipe diff --git a/recipes/escrevinhador.recipe b/recipes/escrevinhador.recipe deleted file mode 100644 index 3011e88bbe..0000000000 --- a/recipes/escrevinhador.recipe +++ /dev/null @@ -1,28 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class Escrevinhador(BasicNewsRecipe): - title = 'Blog Escrevinhador' - __author__ = 'Diniz Bortolotto' - description = 'Posts do Blog Escrevinhador' - publisher = 'Rodrigo Viana' - oldest_article = 5 - max_articles_per_feed = 20 - category = 'news, politics, Brazil' - language = 'pt_BR' - publication_type = 'news and politics portal' - use_embedded_content = False - no_stylesheets = True - remove_javascript = True - - feeds = [(u'Blog Escrevinhador', u'http://www.rodrigovianna.com.br/feed')] - - reverse_article_order = True - - remove_tags_after = [dict(name='div', attrs={'class': 'text'})] - - remove_tags = [ - dict(id='header'), - dict(name='p', attrs={'class': 'tags'}), - dict(name='div', attrs={'class': 'sociable'}) - ] From daaf4c4b550199db8aad422c4fc23c7124934be2 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:29:47 +0200 Subject: [PATCH 06/20] Delete carta_capital.recipe --- recipes/carta_capital.recipe | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 recipes/carta_capital.recipe diff --git a/recipes/carta_capital.recipe b/recipes/carta_capital.recipe deleted file mode 100644 index 9d189bab07..0000000000 --- a/recipes/carta_capital.recipe +++ /dev/null @@ -1,29 +0,0 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -from __future__ import absolute_import, division, print_function, unicode_literals - -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1380852962(BasicNewsRecipe): - title = u'Carta Capital' - __author__ = 'Erico Lisboa' - language = 'pt_BR' - oldest_article = 15 - max_articles_per_feed = 100 - auto_cleanup = True - use_embedded_content = False - - feeds = [(u'Pol\xedtica', - u'http://www.cartacapital.com.br/politica/politica/rss'), (u'Economia', - u'http://www.cartacapital.com.br/economia/economia/atom.xml'), - (u'Sociedade', - u'http://www.cartacapital.com.br/sociedade/sociedade/atom.xml'), - (u'Internacional', - u'http://www.cartacapital.com.br/internacional/internacional/atom.xml'), - (u'Tecnologia', - u'http://www.cartacapital.com.br/tecnologia/tecnologia/atom.xml'), - (u'Cultura', - u'http://www.cartacapital.com.br/cultura/cultura/atom.xml'), - (u'Sa\xfade', u'http://www.cartacapital.com.br/saude/saude/atom.xml'), - (u'Educa\xe7\xe3o', - u'http://www.cartacapital.com.br/educacao/educacao/atom.xml')] From faf65ab82eced9b5e88c11d8c82d6ea637c78772 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:30:16 +0200 Subject: [PATCH 07/20] Delete caros_amigos.recipe --- recipes/caros_amigos.recipe | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 recipes/caros_amigos.recipe diff --git 
a/recipes/caros_amigos.recipe b/recipes/caros_amigos.recipe deleted file mode 100644 index b1d5b2cb8e..0000000000 --- a/recipes/caros_amigos.recipe +++ /dev/null @@ -1,16 +0,0 @@ -__copyright__ = '2011, Pablo Aldama ' -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1311839910(BasicNewsRecipe): - title = u'Caros Amigos' - oldest_article = 20 - max_articles_per_feed = 100 - language = 'pt_BR' - __author__ = 'Pablo Aldama' - - feeds = [(u'Caros Amigos', - u'http://carosamigos.terra.com.br/index2/index.php?format=feed&type=rss')] - keep_only_tags = [dict(name='div', attrs={'class': ['blog']}), dict(name='div', attrs={'class': ['blogcontent']}) - ] - remove_tags = [dict(name='div', attrs={'class': 'addtoany'})] From a193acc8262dd64a50c7ac36920351d916c187e0 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:30:42 +0200 Subject: [PATCH 08/20] Delete idg_now.recipe --- recipes/idg_now.recipe | 49 ------------------------------------------ 1 file changed, 49 deletions(-) delete mode 100644 recipes/idg_now.recipe diff --git a/recipes/idg_now.recipe b/recipes/idg_now.recipe deleted file mode 100644 index 74657d5ab6..0000000000 --- a/recipes/idg_now.recipe +++ /dev/null @@ -1,49 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class IDGNow(BasicNewsRecipe): - title = 'IDG Now!' - __author__ = 'Diniz Bortolotto' - description = 'Posts do IDG Now!' - oldest_article = 7 - max_articles_per_feed = 20 - encoding = 'utf8' - publisher = 'Now!Digital Business Ltda.' - category = 'technology, telecom, IT, Brazil' - language = 'pt_BR' - publication_type = 'technology portal' - use_embedded_content = False - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - - def get_article_url(self, article): - link = article.get('link', None) - if link is None: - return article - if link.split('/')[-1] == "story01.htm": - link = link.split('/')[-2] - a = ['0B', '0C', '0D', '0E', '0F', '0G', - '0I', '0N', '0L0S', '0A', '0J3A'] - b = ['.', '/', '?', '-', '=', '&', '_', '.com', 'www.', '0', ':'] - for i in range(0, len(a)): - link = link.replace(a[i], b[i]) - link = link.split('&')[-3] - link = link.split('=')[1] - link = link + "/IDGNoticiaPrint_view" - return link - - feeds = [ - (u'Ultimas noticias', u'http://rss.idgnow.com.br/c/32184/f/499640/index.rss'), - (u'Computa\xe7\xe3o Corporativa', - u'http://rss.idgnow.com.br/c/32184/f/499643/index.rss'), - (u'Carreira', u'http://rss.idgnow.com.br/c/32184/f/499644/index.rss'), - (u'Computa\xe7\xe3o Pessoal', - u'http://rss.idgnow.com.br/c/32184/f/499645/index.rss'), - (u'Internet', u'http://rss.idgnow.com.br/c/32184/f/499646/index.rss'), - (u'Mercado', u'http://rss.idgnow.com.br/c/32184/f/419982/index.rss'), - (u'Seguran\xe7a', - u'http://rss.idgnow.com.br/c/32184/f/499647/index.rss'), - (u'Telecom e Redes', - u'http://rss.idgnow.com.br/c/32184/f/499648/index.rss') - ] - - reverse_article_order = True From c581643026afa38faf0c8325ba8f9e8966e0fc9e Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:31:12 +0200 Subject: [PATCH 09/20] Delete noticias_r7.recipe --- recipes/noticias_r7.recipe | 43 -------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 recipes/noticias_r7.recipe diff --git a/recipes/noticias_r7.recipe b/recipes/noticias_r7.recipe deleted file mode 100644 index 3bf88a87f1..0000000000 --- a/recipes/noticias_r7.recipe +++ /dev/null @@ -1,43 +0,0 @@ 
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class PortalR7(BasicNewsRecipe):
-    title = 'Noticias R7'
-    __author__ = 'Diniz Bortolotto'
-    description = 'Noticias Portal R7'
-    oldest_article = 2
-    max_articles_per_feed = 20
-    encoding = 'utf8'
-    publisher = 'Rede Record'
-    category = 'news, Brazil'
-    language = 'pt_BR'
-    publication_type = 'newsportal'
-    use_embedded_content = False
-    no_stylesheets = True
-    remove_javascript = True
-    remove_attributes = ['style']
-
-    feeds = [
-        (u'Brasil', u'http://www.r7.com/data/rss/brasil.xml'),
-        (u'Economia', u'http://www.r7.com/data/rss/economia.xml'),
-        (u'Internacional',
-         u'http://www.r7.com/data/rss/internacional.xml'),
-        (u'Tecnologia e Ci\xeancia',
-         u'http://www.r7.com/data/rss/tecnologiaCiencia.xml')
-    ]
-    reverse_article_order = True
-
-    keep_only_tags = [dict(name='div', attrs={'class': 'materia'})]
-    remove_tags = [
-        dict(id=['espalhe', 'report-erro']),
-        dict(name='ul', attrs={'class': 'controles'}),
-        dict(name='ul', attrs={'class': 'relacionados'}),
-        dict(name='div', attrs={'class': 'materia_banner'}),
-        dict(name='div', attrs={'class': 'materia_controles'})
-    ]
-
-    preprocess_regexps = [
-        (re.compile(r'
.*
', re.DOTALL | re.IGNORECASE), - lambda match: '
') - ] From 578c05905d0bb5716111965cf72997ba63088d75 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:31:35 +0200 Subject: [PATCH 10/20] Delete noticias_unb.recipe --- recipes/noticias_unb.recipe | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 recipes/noticias_unb.recipe diff --git a/recipes/noticias_unb.recipe b/recipes/noticias_unb.recipe deleted file mode 100644 index a38e8648d8..0000000000 --- a/recipes/noticias_unb.recipe +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -from calibre.web.feeds.news import BasicNewsRecipe - - -class NoticiasUnB(BasicNewsRecipe): - title = 'Noticias UnB' - __author__ = 'Diniz Bortolotto' - description = 'Noticias da UnB' - oldest_article = 5 - max_articles_per_feed = 20 - category = 'news, educational, Brazil' - language = 'pt_BR' - publication_type = 'newsportal' - use_embedded_content = False - no_stylesheets = True - remove_javascript = True - - feeds = [(u'UnB AgĂȘncia', u'http://www.unb.br/noticias/rss/noticias.rss')] - - reverse_article_order = True - - def print_version(self, url): - return url.replace('http://', 'http://www.unb.br/noticias/print_email/imprimir.php?u=http://') From 2a8b9133de053ceb54c455050a41f98658a4955e Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:39:34 +0200 Subject: [PATCH 11/20] Delete ming_pao_toronto.recipe --- recipes/ming_pao_toronto.recipe | 1018 ------------------------------- 1 file changed, 1018 deletions(-) delete mode 100644 recipes/ming_pao_toronto.recipe diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe deleted file mode 100644 index 24792ae76a..0000000000 --- a/recipes/ming_pao_toronto.recipe +++ /dev/null @@ -1,1018 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2011, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Toronto' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". (Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: False) -__IncludeSummary__ = False -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True -# (HK only) Turn below to True if you wish hi-res images (Default: False) -__HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below -__Date__ = '' - - -''' -Change Log: -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. 
-2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
-2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
-2011/10/19: fix a bug in txt source parsing
-2011/10/17: disable fetching of premium content, also improved txt source parsing
-2011/10/04: option to get hi-res photos for the articles
-2011/09/21: fetching "column" section is made optional.
-2011/09/18: parse "column" section stuff from source text file directly.
-2011/09/07: disable "column" section as it is no longer offered free.
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
-            provide options to remove all images in the file
-2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
-2011/03/06: add new articles for finance section, also a new section "Columns"
-2011/02/28: rearrange the sections
-            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
-            View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues"
-            folder in Kindle 3
-2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
-            clean up the indentation
-2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
-            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
-2010/11/22: add English section, remove eco-news section which is not updated daily, correct
-            ordering of articles
-2010/11/12: add news image and eco-news section
-2010/11/08: add parsing of finance section
-2010/11/06: temporary work-around for Kindle device having no capability to display unicode
-            in section/article list.
-2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}) - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "") - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = dt_utc 
+ \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \ - '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + 
'/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + - dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', - 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + - dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - 
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news.mingpao.com - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' + url - # replace the url to the print-friendly version - if __ParsePFF__ is True: - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 
'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = 
self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # Match non-digits - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - return new_html - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - # thumbnails shouldn't be available if using hi-res images - if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if 
mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 903acbddc00ea7ebe9c1f65cb50155a10afa5083 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:40:46 +0200 Subject: [PATCH 12/20] Delete ming_pao.recipe --- recipes/ming_pao.recipe | 1180 --------------------------------------- 1 file changed, 1180 deletions(-) delete mode 100644 recipes/ming_pao.recipe diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe deleted file mode 100644 index f77db49be9..0000000000 --- a/recipes/ming_pao.recipe +++ /dev/null @@ -1,1180 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2013, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Hong Kong' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". 
(Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: True) -__IncludeSummary__ = True -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with their printer-friendly formats (Default: False) -__ParsePF__ = False -# (HK only) Turn below to True if you wish to parse articles in news1.mingpao.com with text formats (Default: True) -- override __ParsePF__ -__ParseTxt__ = True -# (HK only) Use mobile text version for some articles (Default: False) -__ParseSelectedMobile__ = False -# (HK only) Turn below to True if you wish hi-res images (Default: True) -__HiResImg__ = True -# Override the date returned by the program if specifying a YYYYMMDD below -# (not work if __ParseSelectedMobile__ is True and __UseLife__ is False) -__Date__ = '' - - -''' -Change Log: -2014/10/19: update urls of some web location and top logo -2013/09/28: allow thumbnails even with hi-res images -2012/04/24: improved parsing of news1.mingpao.com content -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. -2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' -2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt -2011/10/19: fix a bug in txt source parsing -2011/10/17: disable fetching of premium content, also improved txt source parsing -2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. -2011/09/18: parse "column" section stuff from source text file directly. -2011/09/07: disable "column" section as it is no longer offered free. 
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source - provide options to remove all images in the file -2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages -2011/03/06: add new articles for finance section, also a new section "Columns" -2011/02/28: rearrange the sections - [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles - View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" - folder in Kindle 3 -2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles - clean up the indentation -2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list - (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) -2010/11/22: add English section, remove eco-news section which is not updated daily, correct - ordering of articles -2010/11/12: add news image and eco-news section -2010/11/08: add parsing of finance section -2010/11/06: temporary work-around for Kindle device having no capability to display unicode - in section/article list. -2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news1.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/mingpaonews_logo.png' - remove_tags_before = dict(name='font', attrs={'color': ['navy']}) - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}), - dict(name='table', attrs={'width': ['100%'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'border':['0']}) # content 
table in pda site - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # non-article images in life.mingpao.com article - dict(name='img', attrs={ - 'alt': ["明報網站", "按此列印", "關閉本視窗"]}), - dict(name='img', attrs={ - 'src': ["../image/top_2.gif"]}) - # dict(name='table') # for content fetched from life.mingpao.com - # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from life.mingpao.com - # dict(name='table', attrs={'width':['98%'], 'cellspacing':['0'], 'cellpadding':['0'], 'border':['0']}) - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: ""), - (re.compile(r'


', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'', re.DOTALL | re.IGNORECASE), - lambda match: ''), - # (re.compile(r'[
.+?]', re.DOTALL|re.IGNORECASE), - # lambda match: '') - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = 
dt_utc + \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - # Note: does not work with custom date given by __Date__ - def get_weekday(self): - return self.get_dtlocal().weekday() - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news1.mingpao.com/' + self.get_fetchdate() + '/' + \ - self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - # new - if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): - # if both not on Sunday and not __ParseSelectedMobile__, go ahead - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = 
self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if self.get_weekday() != 6: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - else: - if __InclPremium__ is True and __ParseSelectedMobile__ is True: - articles = self.parse_section_mobile( - 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') - if articles: - feeds.append( - (u'\u526f\u520a Supplement', articles)) - else: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - # end of new - else: - for title, url, seckey in [(u'\u8981\u805e Headline', 'http://news1.mingpao.com/' + dateStr + '/gaindex.htm', 'ga'), - (u'\u6e2f\u805e Local', 'http://news1.mingpao.com/' + - dateStr + '/gbindex.htm', 'gb'), - (u'\u6559\u80b2 Education', 'http://news1.mingpao.com/' + - dateStr + '/gfindex.htm', 'gf'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news1.mingpao.com/' + dateStr + '/mrindex.htm', 'mr')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url, seckey in [(u'\u8ad6\u58c7 Forum', 'http://news1.mingpao.com/' + dateStr + '/faindex.htm', 'fa'), - (u'\u4e2d\u570b China', 'http://news1.mingpao.com/' + - dateStr + '/caindex.htm', 'ca'), - (u'\u570b\u969b World', 'http://news1.mingpao.com/' + dateStr + '/taindex.htm', 'ta')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news1.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news1.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = 
self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True and (self.get_weekday() != 6 or __ParseSelectedMobile__ is False): - # if both not on Sunday or not __ParseSelectedMobile__, go ahead - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is False or self.get_weekday() != 6: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - else: - if __InclPremium__ is True and __ParseSelectedMobile__ is True: - articles = self.parse_section_mobile( - 'http://premium.mingpao.com/pda/palm', 'HotNews1.cfm?cat=ja&token=b218bc260b89c0&online=1') - if articles: - feeds.append( - (u'\u526f\u520a Supplement', articles)) - else: - for title, url, seckey in [(u'\u526f\u520a Supplement', 'http://news1.mingpao.com/' + dateStr + '/jaindex.htm', 'ja')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - for title, url, seckey in [(u'\u82f1\u6587 English', 'http://news1.mingpao.com/' + dateStr + '/emindex.htm', 'em')]: - if __ParseTxt__ is False: - articles = self.parse_section(url) - else: - articles = self.parse_section_txt(url, seckey) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 
'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news1.mingpao.com (web html) - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news1.mingpao.com/' + dateStr + '/' + url - # replace the url to the alternative version - if __ParsePF__ is True: - # printer-friendly option - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - if __InclPremium__ is True: - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - # if url not in included_urls and url.rfind('Redirect') == -1 and - # (__InclPremium__ is False or - # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from news1.mingpao.com (txt) - def parse_section_txt(self, url, ch): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - # print 'Base url: ', url - # replace the url to the alternative version - # text version - if url.rfind('Redirect') != -1: - url = 'http://news1.mingpao.com/' + dateStr + '/' + url - # print 'original url: ', url - url = re.sub( - dateStr + '/../cfm/Redirect.cfm.*NewsFile=', 'ftp/WebNews2/', url) - url = re.sub('%2F', '/', url) - if __InclPremium__ is True: - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '.txt') - url = url.replace('%5F', '_') - else: - # get the first two char in url as ch - seckey = url[0:2] - url = url.replace('.htm', '.txt') - url = 'http://news1.mingpao.com/ftp/WebNews2/' + \ - dateStr + '/' + ch + '/' + seckey + '/' + url - # print 'updated url: ', url - - if url not in included_urls and (__InclPremium__ is True or title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - # if url not in included_urls and (url.rfind('Redirect') == -1) - # and (__InclPremium__ is 
False or - # title.rfind(u'\u6536\u8cbb\u5167\u5bb9') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from mobile version - def parse_section_mobile(self, base, page): - soup = self.index_to_soup(base + '/' + page) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = i.get('href', False) - if url not in included_urls and url.rfind('HotNews2.cfm') != -1: - current_articles.append( - {'title': title, 'url': base + '/' + url, 'description': ''}) - included_urls.append(url) - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = 
soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # split the text source on newlines - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news1.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - if __ParseTxt__ is False: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - else: - # insert to front - # print 'imgstr: ', img - pos = img.find('_') - new_html = new_html.replace(img[5:], '_' + img[5:]) - - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - # test - # print new_html - return new_html - - def preprocess_html(self, soup): - for mobiletitle in soup.findAll('font', attrs={'color': ['navy']}): - mobiletitle.name = 'h1' - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = 
TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 2ca33f5da16a94615d1c7229a455f6a76d5e1635 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:11 +0200 Subject: [PATCH 13/20] Delete nanfengchuang.recipe --- recipes/nanfengchuang.recipe | 106 ----------------------------------- 1 file changed, 106 deletions(-) delete mode 100644 recipes/nanfengchuang.recipe diff --git a/recipes/nanfengchuang.recipe b/recipes/nanfengchuang.recipe deleted file mode 100644 index 18a7be556f..0000000000 --- a/recipes/nanfengchuang.recipe +++ /dev/null @@ -1,106 +0,0 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai -from __future__ import unicode_literals -from calibre.web.feeds.recipes import BasicNewsRecipe -from lxml import html - -__license__ = 'GPL v3' - - -class Nfcmag(BasicNewsRecipe): - - __author__ = '2014, Chen Wei ' - title = 'Nan Feng Chuang / South Reviews Magazine' - description = ''' -South Reviews Magazine, established in 1985, is a Guangzhou-based political and -economic biweekly. 
South Reviews enjoys a reputation of being fair and objective, with graceful -narration, insightful expression among its readers, mostly government -officials, economic leaders and intellectuals. It has been praised as “the No.1 -Political& Economical Magazine in China”. - -The US magazine Time described South Reviews as "a highbrow news magazine". -Other international media organizations such as BBC and NHK have conducted -tracking shots of South Reviews journalists, to record their unique value -special position in China’s media industry. Harvard-Yenching Library, Stanford -University's East Asia Library and UC Berkeley Library have collections of the -magazine since its first issue, taking them as an important source to -understand China's economic and social reform. - -Since 2008, South Reviews has been committed to transforming into a -research-based media organization. Most of its editors, reporters and -contributors have remarkably strong academic backgrounds, coming from Peking -University, Tsinghua University, London School of Economics and Political -Science, the Chinese University of Hong Kong, Renmin University of China, and -other well-known institutions. The magazine has established research divisions, -including the State Policy Research Center and the Brand Promotion Research -Center, working in cooperation with well-known academic institutions and -providing valuable research reports for governments and companies. - -''' - language = 'zh' - encoding = 'UTF-8' - publisher = 'South Reviews Magazine' - publication_type = 'magazine' - category = 'news, Business, China' - timefmt = ' [%a, %d %b, %Y]' - needs_subscription = False - - remove_tags = [dict(attrs={'class': ['side-left', 'side-right', - 'breadcrumbs', 'score', 'weboNav']}), - dict(attrs={'id': ['header', 'footer']}), - dict(name=['script', 'noscript', 'style'])] - no_stylesheets = True - remove_javascript = True - current_issue_url = "" - current_issue_cover = "" - - def parse_index(self): - - baseurl = 'http://www.nfcmag.com/' - raw = self.index_to_soup('http://www.nfcmag.com/magazine', raw=True) - soup_start = html.fromstring(raw) - - els = soup_start.xpath("""//div[contains(@class, 'lastest-magazine') - and contains(@class, 'comBox')] - //a[@href and not(@id) and not(child::img)] - """) - for x in els: - issueurl = x.get('href') - if not issueurl.lower().startswith('http://'): - issueurl = baseurl + issueurl - break - - raw = self.index_to_soup(issueurl, raw=True) - soup_issue = html.fromstring(raw) - - coverimg = soup_issue.xpath("""//div[contains(@class, 'lastest-magazine') - and contains(@class, 'comBox')] - //img[@*] """) - imgurl = coverimg[0].get('src') - if not imgurl.lower().startswith('http://'): - imgurl = baseurl + imgurl - self.current_issue_cover = imgurl - feeds = [] - - sections = soup_issue.xpath("""//div[contains(@class, 'article-box') - and contains(@class, 'comBox')] """) - for sec in sections: - pages = sec.xpath('.//h5') - sec_title = sec.xpath('.//h4')[0].text_content() - self.log('Found section:', sec_title) - articles = [] - for x in pages: - url = x.xpath('.//a')[0].get('href') - if not url.lower().startswith('http://'): - url = baseurl + url - url = url[:-5] + '-s.html' # to print view - - title = x.text_content() - - articles.append({'title': title, 'url': url, 'date': None}) - - if articles: - feeds.append((sec_title, articles)) - return feeds - - def get_cover_url(self): - return self.current_issue_cover From 7814e8b8ec8fc8c57f2adc297b5d6fcb4c57d120 Mon Sep 17 00:00:00 2001 From: Jony 
<23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:28 +0200 Subject: [PATCH 14/20] Delete ming_pao_vancouver.recipe --- recipes/ming_pao_vancouver.recipe | 1018 ----------------------------- 1 file changed, 1018 deletions(-) delete mode 100644 recipes/ming_pao_vancouver.recipe diff --git a/recipes/ming_pao_vancouver.recipe b/recipes/ming_pao_vancouver.recipe deleted file mode 100644 index f8b1e9309d..0000000000 --- a/recipes/ming_pao_vancouver.recipe +++ /dev/null @@ -1,1018 +0,0 @@ -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2010-2011, Eddie Lau' - -# Region - Hong Kong, Vancouver, Toronto -__Region__ = 'Vancouver' -# Users of Kindle 3 with limited system-level CJK support -# please replace the following "True" with "False". (Default: True) -__MakePeriodical__ = True -# Turn below to True if your device supports display of CJK titles -# (Default: False) -__UseChineseTitle__ = False -# Set it to False if you want to skip images (Default: True) -__KeepImages__ = True -# Set it to True if you want to include a summary in Kindle's article view -# (Default: False) -__IncludeSummary__ = False -# Set it to True if you want thumbnail images in Kindle's article view -# (Default: True) -__IncludeThumbnails__ = True -# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True) -__UseLife__ = True -# (HK only) It is to disable premium content (Default: False) -__InclPremium__ = False -# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True) -__ParsePFF__ = True -# (HK only) Turn below to True if you wish hi-res images (Default: False) -__HiResImg__ = False -# Override the date returned by the program if specifying a YYYYMMDD below -__Date__ = '' - - -''' -Change Log: -2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away - from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day - download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device. -2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010' -2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt -2011/10/19: fix a bug in txt source parsing -2011/10/17: disable fetching of premium content, also improved txt source parsing -2011/10/04: option to get hi-res photos for the articles -2011/09/21: fetching "column" section is made optional. -2011/09/18: parse "column" section stuff from source text file directly. -2011/09/07: disable "column" section as it is no longer offered free. 
-2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source - provide options to remove all images in the file -2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages -2011/03/06: add new articles for finance section, also a new section "Columns" -2011/02/28: rearrange the sections - [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles - View] make it the same title if generating a periodical, so past issue will be automatically put into "Past Issues" - folder in Kindle 3 -2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles - clean up the indentation -2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list - (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) -2010/11/22: add English section, remove eco-news section which is not updated daily, correct - ordering of articles -2010/11/12: add news image and eco-news section -2010/11/08: add parsing of finance section -2010/11/06: temporary work-around for Kindle device having no capability to display unicode - in section/article list. -2010/10/31: skip repeated articles in section pages -''' - -from calibre.utils.date import now as nowf -import os -import datetime -import re -import mechanize -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from calibre.ebooks.metadata.opf2 import OPFCreator -from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.metadata import MetaInformation -from calibre.utils.localization import canonicalize_lang - -# MAIN CLASS - - -class MPRecipe(BasicNewsRecipe): - if __Region__ == 'Hong Kong': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u9999\u6e2f)' - else: - title = 'Ming Pao - Hong Kong' - description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' - category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' # noqa - masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' - keep_only_tags = [dict(name='h1'), - # for entertainment page title - dict(name='font', attrs={ - 'style': ['font-size:14pt; line-height:160%;']}), - # for column articles title - dict(name='font', attrs={'color': ['AA0000']}), - # for heading from txt - dict(attrs={'class': ['heading']}), - # entertainment and column page content - dict(attrs={'id': ['newscontent']}), - dict( - attrs={'id': ['newscontent01', 'newscontent02']}), - # for content from txt - dict(attrs={'class': ['content']}), - dict(attrs={'class': ['photo']}), - dict(name='table', attrs={'width': ['100%'], 'border':['0'], 'cellspacing':[ - '5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com - # images for source from life.mingpao.com - dict(name='img', attrs={ - 'width': ['180'], 'alt':['????']}), - # for images from txt - dict(attrs={'class': ['images']}) - ] - if __KeepImages__: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in 
life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - else: - remove_tags = [dict(name='style'), - # for the finance page from mpfinance.com - dict(attrs={'id': ['newscontent135']}), - # article date in life.mingpao.com article - dict(name='font', attrs={ - 'size': ['2'], 'color':['666666']}), - dict(name='img'), - # dict(name='table') # for content fetched from - # life.mingpao.com - ] - remove_attributes = ['width'] - preprocess_regexps = [ - (re.compile(r'
', re.DOTALL | re.IGNORECASE), - lambda match: '

'), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), - lambda match: ''), - (re.compile(r'

', re.DOTALL | re.IGNORECASE), # for entertainment page - lambda match: ''), - # skip
after title in life.mingpao.com fetched article - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "
"), - (re.compile(r"

", re.DOTALL | re.IGNORECASE), - lambda match: "") - ] - elif __Region__ == 'Vancouver': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)' - else: - title = 'Ming Pao - Vancouver' - description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)' - category = 'Chinese, News, Vancouver' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - elif __Region__ == 'Toronto': - if __UseChineseTitle__ is True: - title = u'\u660e\u5831 (\u591a\u502b\u591a)' - else: - title = 'Ming Pao - Toronto' - description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)' - category = 'Chinese, News, Toronto' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}' # noqa - masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif' - keep_only_tags = [dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}), - dict(name='table', attrs={'width': ['450'], 'border':['0'], 'cellspacing':[ - '3'], 'cellpadding':['3'], 'id':['tblContent3']}), - dict(name='table', attrs={'width': ['180'], 'border':['0'], 'cellspacing':[ - '0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}), - ] - if __KeepImages__: - # the magnifier icon - remove_tags = [ - dict(name='img', attrs={'src': ['../../../image/magnifier.gif']})] - else: - remove_tags = [dict(name='img')] - remove_attributes = ['width'] - preprocess_regexps = [(re.compile(r' ', re.DOTALL | re.IGNORECASE), - lambda match: ''), - ] - - oldest_article = 1 - max_articles_per_feed = 100 - __author__ = 'Eddie Lau' - publisher = 'MingPao' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'zh' - encoding = 'Big5-HKSCS' - recursions = 0 - conversion_options = {'linearize_tables': True} - timefmt = '' - - def get_dtlocal(self): - dt_utc = datetime.datetime.utcnow() - if __Region__ == 'Hong Kong': - # convert UTC to local hk time - at HKT 4.30am, all news are - # available - dt_local = dt_utc + \ - datetime.timedelta(8.0 / 24) - datetime.timedelta(4.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24) - elif __Region__ == 'Vancouver': - # convert UTC to local Vancouver time - at PST time 5.30am, all - # news are available - dt_local = dt_utc + \ - datetime.timedelta(-8.0 / 24) - datetime.timedelta(5.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24) - elif __Region__ == 'Toronto': - # convert UTC to local Toronto time - at EST time 8.30am, all news - # are available - dt_local = dt_utc 
+ \ - datetime.timedelta(-5.0 / 24) - datetime.timedelta(8.5 / 24) - # dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24) - return dt_local - - def get_fetchdate(self): - if __Date__ != '': - return __Date__ - else: - return self.get_dtlocal().strftime("%Y%m%d") - - def get_fetchformatteddate(self): - if __Date__ != '': - return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] - else: - return self.get_dtlocal().strftime("%Y-%m-%d") - - def get_fetchyear(self): - if __Date__ != '': - return __Date__[0:4] - else: - return self.get_dtlocal().strftime("%Y") - - def get_fetchmonth(self): - if __Date__ != '': - return __Date__[4:6] - else: - return self.get_dtlocal().strftime("%m") - - def get_fetchday(self): - if __Date__ != '': - return __Date__[6:8] - else: - return self.get_dtlocal().strftime("%d") - - def get_cover_url(self): - if __Region__ == 'Hong Kong': - cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + \ - '_' + self.get_fetchday() + 'gacov.jpg' - elif __Region__ == 'Vancouver': - cover = 'http://www.mingpaovan.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' - elif __Region__ == 'Toronto': - cover = 'http://www.mingpaotor.com/ftp/News/' + \ - self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - cover = None - return cover - - def parse_index(self): - feeds = [] - dateStr = self.get_fetchdate() - - if __Region__ == 'Hong Kong': - if __UseLife__: - for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'), - (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgb', 'nal'), - (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalgf', 'nal'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', - 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'), - (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalfa', 'nal'), - (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalca', 'nal'), - (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalta', 'nal'), - (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalea', 'nal'), - (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalsp', 'nal'), - (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + - dateStr + '&Category=nalma', 'nal') - ]: - if __InclPremium__ is True: - articles = self.parse_section2_txt(url, keystr) - else: - articles = self.parse_section2(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + 
'/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - else: - for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), - (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + - dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', - 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), - (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special- editorial - # ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - # if ed_articles: - # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) - - for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), - (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + - dateStr + '/caindex.htm'), - (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - # special - finance - # fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - # fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - # if fin_articles: - # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - - for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - # for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - # articles = self.parse_section(url) - # if articles: - # feeds.append((title, articles)) - - # special - entertainment - # ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - # if ent_articles: - # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) - - for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - if __InclPremium__ is True: - # parse column section articles directly from .txt files - for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl') # noqa - ]: - articles = self.parse_section2_txt(url, keystr) - if articles: - feeds.append((title, articles)) - - for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), - (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: - articles = self.parse_section(url) - if articles: - feeds.append((title, articles)) - - elif __Region__ == 'Vancouver': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VBindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VDindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-VGindex.htm'), - 
(u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VTindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VCindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/VSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaovan.com/') - if articles: - feeds.append((title, articles)) - elif __Region__ == 'Toronto': - for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'), - (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TDindex.htm'), - (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TFindex.htm'), - (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TCAindex.htm'), - (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TTAindex.htm'), - (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-GAindex.htm'), - (u'\u7d93\u6fdf Economics', - 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'), - (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/TSindex.htm'), - (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + - dateStr + '/HK-MAindex.htm'), - (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'), ]: - articles = self.parse_section3( - url, 'http://www.mingpaotor.com/') - if articles: - feeds.append((title, articles)) - return feeds - - # parse from news.mingpao.com - def parse_section(self, url): - dateStr = self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - a = i.find('a', href=True) - title = self.tag_to_string(a) - url = a.get('href', False) - url = 'http://news.mingpao.com/' + dateStr + '/' + url - # replace the url to the print-friendly version - if __ParsePFF__ is True: - if url.rfind('Redirect') != -1 and __InclPremium__ is True: - url = re.sub(dateStr + '.*' + dateStr, dateStr, url) - url = re.sub('%2F.*%2F', '/', url) - title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') - url = url.replace('%2Etxt', '_print.htm') - url = url.replace('%5F', '_') - else: - url = url.replace('.htm', '_print.htm') - if url not in included_urls and url.rfind('Redirect') == -1: - current_articles.append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from life.mingpao.com - def parse_section2(self, url, keystr): - br = mechanize.Browser() - br.set_handle_redirect(False) - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - try: - br.open_novisit(url) - # use printed version of the article - url = url.replace('dailynews3.cfm', 
'dailynews3a.cfm') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - except: - print('skipping a premium article') - current_articles.reverse() - return current_articles - - # parse from text file of life.mingpao.com - def parse_section2_txt(self, url, keystr): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1): - # use printed version of the article - url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # parse from www.mingpaovan.com - def parse_section3(self, url, baseUrl): - self.get_fetchdate() - soup = self.index_to_soup(url) - divs = soup.findAll(attrs={'class': ['ListContentLargeLink']}) - current_articles = [] - included_urls = [] - divs.reverse() - for i in divs: - title = self.tag_to_string(i) - urlstr = i.get('href', False) - urlstr = baseUrl + '/' + urlstr.replace('../../../', '') - if urlstr not in included_urls: - current_articles.append( - {'title': title, 'url': urlstr, 'description': '', 'date': ''}) - included_urls.append(urlstr) - current_articles.reverse() - return current_articles - - def parse_ed_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_fin_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - current_articles = [] - included_urls = [] - for i in a: - # url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - # if url not in included_urls and not url.rfind(dateStr) == -1 and - # url.rfind('index') == -1: - if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1): - title = self.tag_to_string(i) - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - return current_articles - - def parse_ent_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = self.tag_to_string(i) - url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - def parse_col_section(self, url): - self.get_fetchdate() - soup = self.index_to_soup(url) - a = soup.findAll('a', href=True) - a.reverse() - current_articles = [] - included_urls = [] - for i in a: - title = 
self.tag_to_string(i) - url = 'http://life.mingpao.com/cfm/' + i.get('href', False) - if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1): - current_articles.append( - {'title': title, 'url': url, 'description': ''}) - included_urls.append(url) - current_articles.reverse() - return current_articles - - # preprocess those .txt and javascript based files - def preprocess_raw_html(self, raw_html, url): - new_html = raw_html - if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1: - if url.rfind('_print.htm') != -1: - # javascript based file - splitter = re.compile(r'\n') - new_raw_html = 'Untitled' - new_raw_html = new_raw_html + '' - for item in splitter.split(raw_html): - if item.startswith('var heading1 ='): - heading = item.replace('var heading1 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - new_raw_html = new_raw_html + '
' + heading - if item.startswith('var heading2 ='): - heading = item.replace('var heading2 = \'', '') - heading = heading.replace('\'', '') - heading = heading.replace(';', '') - if heading != '': - new_raw_html = new_raw_html + '
' + heading + '
' - else: - new_raw_html = new_raw_html + '
' - if item.startswith('var content ='): - content = item.replace("var content = ", '') - content = content.replace('\'', '') - content = content.replace(';', '') - new_raw_html = new_raw_html + '
' + content + '
' - if item.startswith('var photocontent ='): - photo = item.replace('var photocontent = \'', '') - photo = photo.replace('\'', '') - photo = photo.replace(';', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '') - photo = photo.replace('', '
') - photo = photo.replace('class="photo"', '') - new_raw_html = new_raw_html + '
' + photo + '
' - new_html = new_raw_html + '' - else: - # .txt based file - splitter = re.compile(r'\n') # split on newlines - new_raw_html = 'Untitled
' - next_is_img_txt = False - title_started = False - title_break_reached = False - met_article_start_char = False - for item in splitter.split(raw_html): - item = item.strip() - # if title already reached but break between title and - # content not yet found, record title_break_reached - if title_started is True and title_break_reached is False and item == '': - title_break_reached = True - # if title reached and title_break_reached and met_article_start_char is False and item is not empty - # start content - elif title_started is True and title_break_reached is True and met_article_start_char is False: - if item != '': - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - # if item.startswith(u'\u3010'): - # met_article_start_char = True - # new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt is False: - if item.startswith("=@"): - print('skip movie link') - elif item.startswith("=?"): - next_is_img_txt = True - new_raw_html += '

\n' - elif item.startswith('=='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[2:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - elif item.startswith('='): - next_is_img_txt = True - if False: - # TODO: check existence of .gif first - newimg = '_' + item[1:].strip() + '.jpg' - new_raw_html += '

\n' - else: - new_raw_html += '

\n' - else: - if next_is_img_txt is False and met_article_start_char is False: - if item != '': - if title_started is False: - # print 'Title started at ', item - new_raw_html = new_raw_html + '

' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' - else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - new_html = new_raw_html + '

' - if __HiResImg__ is True: - # TODO: add a _ in front of an image url - if url.rfind('news.mingpao.com') > -1: - imglist = re.findall('src="?.*?jpg"', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - for img in imglist: - gifimg = img.replace('jpg"', 'gif"') - try: - br.open_novisit( - url + "/../" + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - # find the location of the first _ - pos = img.find('_') - if pos > -1: - # if found, insert _ after the first _ - newimg = img[0:pos] + '_' + img[pos:] - new_html = new_html.replace(img, newimg) - else: - # if not found, insert _ after " - new_html = new_html.replace( - img[1:], '"_' + img[1:]) - elif url.rfind('life.mingpao.com') > -1: - imglist = re.findall('src=\'?.*?jpg\'', new_html) - br = mechanize.Browser() - br.set_handle_redirect(False) - # print 'Img list: ', imglist, '\n' - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg\'', 'gif\'') - try: - gifurl = re.sub(r'dailynews.*txt', '', url) - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.rfind('/') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - new_html = new_html.replace(img, newimg) - # repeat with src quoted by double quotes, for text parsed from - # src txt - imglist = re.findall('src="?.*?jpg"', new_html) - for img in imglist: - # print 'Found img: ', img - gifimg = img.replace('jpg"', 'gif"') - try: - # print 'url', url - pos = url.rfind('/') - gifurl = url[:pos + 1] - # print 'try it:', gifurl + gifimg[5:len(gifimg)-1] - br.open_novisit(gifurl + gifimg[5:len(gifimg) - 1]) - new_html = new_html.replace(img, gifimg) - except: - pos = img.find('"') - newimg = img[0:pos + 1] + '_' + img[pos + 1:] - # print 'Use hi-res img', newimg - new_html = new_html.replace(img, newimg) - return new_html - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(style=True): - del item['width'] - for item in soup.findAll(stype=True): - del item['absmiddle'] - return soup - - def populate_article_metadata(self, article, soup, first): - # thumbnails shouldn't be available if using hi-res images - if __IncludeThumbnails__ and __HiResImg__ is False and first and hasattr(self, 'add_toc_thumbnail'): - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - - try: - if __IncludeSummary__ and len(article.text_summary.strip()) == 0: - # look for content - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - # the text may or may not be enclosed in

- # tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - textFound = False - for p in paras: - if not textFound: - summary_candidate = self.tag_to_string( - p).strip() - summary_candidate = summary_candidate.replace( - u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1) - if len(summary_candidate) > 0: - article.summary = article.text_summary = summary_candidate - textFound = True - else: - # display a simple text - # article.summary = article.text_summary = u'\u66f4\u591a......' - # display word counts - counts = 0 - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'id': 'newscontent01'}) - if not articlebodies: - articlebodies = soup.findAll( - 'div', attrs={'class': 'content'}) - if not articlebodies: - articlebodies = soup.findAll('div', attrs={'id': 'font'}) - if articlebodies: - for articlebody in articlebodies: - # the text may or may not be enclosed in

tag - paras = articlebody.findAll('p') - if not paras: - paras = articlebody - for p in paras: - summary_candidate = self.tag_to_string(p).strip() - counts += len(summary_candidate) - article.summary = article.text_summary = u'\uff08' + \ - str(counts) + u'\u5b57\uff09' - except: - self.log("Error creating article descriptions") - return - - # override from the one in version 0.8.31 - def create_opf(self, feeds, dir=None): - if dir is None: - dir = self.output_dir - title = self.short_title() - # change 1: allow our own flag to tell if a periodical is to be generated - # also use customed date instead of current time - if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title: - title = title + ' ' + self.get_fetchformatteddate() - # end of change 1 - # change 2: __appname__ replaced by newspaper publisher - __appname__ = self.publisher - mi = MetaInformation(title, [__appname__]) - mi.publisher = __appname__ - mi.author_sort = __appname__ - # change 3: use __MakePeriodical__ flag to tell if a periodical should - # be generated - if __MakePeriodical__ is True: - mi.publication_type = 'periodical:' + \ - self.publication_type + ':' + self.short_title() - else: - mi.publication_type = self.publication_type + ':' + self.short_title() - # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() - # change 4: in the following, all the nowf() are changed to adjusted time - # This one doesn't matter - mi.timestamp = nowf() - # change 5: skip listing the articles - # article_titles, aseen = [], set() - # for f in feeds: - # for a in f: - # if a.title and a.title not in aseen: - # aseen.add(a.title) - # article_titles.append(force_unicode(a.title, 'utf-8')) - - # mi.comments = self.description - # if not isinstance(mi.comments, unicode): - # mi.comments = mi.comments.decode('utf-8', 'replace') - # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - # '\n\n'.join(article_titles)) - - language = canonicalize_lang(self.language) - if language is not None: - mi.language = language - # This one affects the pub date shown in kindle title - # mi.pubdate = nowf() - # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( - self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) - opf_path = os.path.join(dir, 'index.opf') - ncx_path = os.path.join(dir, 'index.ncx') - - opf = OPFCreator(dir, mi) - # Add mastheadImage entry to section - mp = getattr(self, 'masthead_path', None) - if mp is not None and os.access(mp, os.R_OK): - from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename( - self.masthead_path), os.getcwd()) - ref.type = 'masthead' - ref.title = 'Masthead Image' - opf.guide.append(ref) - - manifest = [os.path.join(dir, 'feed_%d' % i) - for i in range(len(feeds))] - manifest.append(os.path.join(dir, 'index.html')) - manifest.append(os.path.join(dir, 'index.ncx')) - - # Get cover - cpath = getattr(self, 'cover_path', None) - if cpath is None: - pf = open(os.path.join(dir, 'cover.jpg'), 'wb') - if self.default_cover(pf): - cpath = pf.name - if cpath is not None and os.access(cpath, os.R_OK): - opf.cover = cpath - manifest.append(cpath) - - # Get masthead - mpath = getattr(self, 'masthead_path', None) - if mpath is not None and os.access(mpath, os.R_OK): - manifest.append(mpath) - - opf.create_manifest_from_files_in(manifest) - for mani in opf.manifest: - if mani.path.endswith('.ncx'): - mani.id = 'ncx' - if 
mani.path.endswith('mastheadImage.jpg'): - mani.id = 'masthead-image' - - entries = ['index.html'] - toc = TOC(base_path=dir) - self.play_order_counter = 0 - self.play_order_map = {} - - def feed_index(num, parent): - f = feeds[num] - for j, a in enumerate(f): - if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/' % (num, j) - auth = a.author - if not auth: - auth = None - desc = a.text_summary - if not desc: - desc = None - else: - desc = self.description_limiter(desc) - tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html' % adir) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - parent.add_item('%sindex.html' % adir, None, - a.title if a.title else _( - 'Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join( - self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) - for sp in a.sub_pages: - prefix = os.path.commonprefix([opf_path, sp]) - relp = sp[len(prefix):] - entries.append(relp.replace(os.sep, '/')) - last = sp - - if os.path.exists(last): - with open(last, 'rb') as fi: - src = fi.read().decode('utf-8') - soup = BeautifulSoup(src) - body = soup.find('body') - if body is not None: - prefix = '/'.join('..'for i in range(2 * - len(re.findall(r'link\d+', last)))) - templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render( - doctype='xhtml').decode('utf-8')).find('div') - body.insert(len(body.contents), elem) - with open(last, 'wb') as fi: - fi.write(type(u'')(soup).encode('utf-8')) - if len(feeds) == 0: - raise Exception('All feeds are empty, aborting.') - - if len(feeds) > 1: - for i, f in enumerate(feeds): - entries.append('feed_%d/index.html' % i) - po = self.play_order_map.get(entries[-1], None) - if po is None: - self.play_order_counter += 1 - po = self.play_order_counter - auth = getattr(f, 'author', None) - if not auth: - auth = None - desc = getattr(f, 'description', None) - if not desc: - desc = None - feed_index(i, toc.add_item('feed_%d/index.html' % i, None, - f.title, play_order=po, description=desc, author=auth)) - - else: - entries.append('feed_%d/index.html' % 0) - feed_index(0, toc) - - for i, p in enumerate(entries): - entries[i] = os.path.join(dir, p.replace('/', os.sep)) - opf.create_spine(entries) - opf.set_toc(toc) - - with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: - opf.render(opf_file, ncx_file) From 4cfd52708a73731052a8f569dd90f3fc3e9d01f1 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:41:44 +0200 Subject: [PATCH 15/20] Delete cnd_weekly.recipe --- recipes/cnd_weekly.recipe | 74 --------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 recipes/cnd_weekly.recipe diff --git a/recipes/cnd_weekly.recipe b/recipes/cnd_weekly.recipe deleted file mode 100644 index 7566ec9548..0000000000 --- a/recipes/cnd_weekly.recipe +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2010, Derek Liang ' -''' -cnd.org -''' -import re - -from calibre.web.feeds.news import BasicNewsRecipe - - -class TheCND(BasicNewsRecipe): - - title = 'CND Weekly' - __author__ = 'Derek Liang' - description = '' - INDEX = 'http://cnd.org' - language = 'zh' - conversion_options = {'linearize_tables': True} - - remove_tags_before = 
dict(name='div', id='articleHead') - remove_tags_after = dict(id='copyright') - remove_tags = [dict(name='table', attrs={'align': 'right'}), dict(name='img', attrs={ - 'src': 'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})] - no_stylesheets = True - - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: ''), - (re.compile('', - re.DOTALL), lambda m: ''), - ] - - def print_version(self, url): - if url.find('news/article.php') >= 0: - return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url) - else: - return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url) - - def parse_index(self): - soup = self.index_to_soup(self.INDEX) - - feeds = [] - articles = {} - - for a in soup.findAll('a', attrs={'target': '_cnd'}): - url = a['href'] - if url.find('article.php') < 0: - continue - if url.startswith('/'): - url = 'http://cnd.org' + url - title = self.tag_to_string(a) - date = a.nextSibling - if not re.search('cm', date): - continue - self.log('\tFound article: ', title, 'at', url, '@', date) - if (date is not None) and len(date) > 2: - if date not in articles: - articles[date] = [] - articles[date].append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - self.log('\t\tAppend to : ', date) - - sorted_articles = sorted(articles) - while sorted_articles: - mostCurrent = sorted_articles.pop() - self.title = 'CND ' + mostCurrent - feeds.append((self.title, articles[mostCurrent])) - - return feeds - - def populate_article_metadata(self, article, soup, first): - header = soup.find('h3') - self.log('header: ' + self.tag_to_string(header)) - pass From 4e1bbdbd0180ae6e0a20b5418310520162084a81 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:00 +0200 Subject: [PATCH 16/20] Delete cnd.recipe --- recipes/cnd.recipe | 73 ---------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 recipes/cnd.recipe diff --git a/recipes/cnd.recipe b/recipes/cnd.recipe deleted file mode 100644 index 320f355e01..0000000000 --- a/recipes/cnd.recipe +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python2 - -__license__ = 'GPL v3' -__copyright__ = '2010, Derek Liang ' -''' -cnd.org -''' -import re - -from calibre.web.feeds.news import BasicNewsRecipe - - -class TheCND(BasicNewsRecipe): - - title = 'CND' - __author__ = 'Derek Liang' - description = '' - INDEX = 'http://cnd.org' - language = 'zh' - conversion_options = {'linearize_tables': True} - - remove_tags_before = dict(name='div', id='articleHead') - remove_tags_after = dict(id='copyright') - remove_tags = [dict(name='table', attrs={'align': 'right'}), dict(name='img', attrs={ - 'src': 'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})] - no_stylesheets = True - - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: ''), - (re.compile('
', - re.DOTALL), lambda m: ''), - ] - - def print_version(self, url): - if url.find('news/article.php') >= 0: - return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url) - else: - return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url) - - def parse_index(self): - soup = self.index_to_soup(self.INDEX) - - feeds = [] - articles = {} - - for a in soup.findAll('a', attrs={'target': '_cnd'}): - url = a['href'] - if url.find('article.php') < 0: - continue - if url.startswith('/'): - url = 'http://cnd.org' + url - title = self.tag_to_string(a) - self.log('\tFound article: ', title, 'at', url) - date = a.nextSibling - if re.search('cm', date): - continue - if (date is not None) and len(date) > 2: - if date not in articles: - articles[date] = [] - articles[date].append( - {'title': title, 'url': url, 'description': '', 'date': ''}) - self.log('\t\tAppend to : ', date) - - mostCurrent = sorted(articles).pop() - self.title = 'CND ' + mostCurrent - - feeds.append((self.title, articles[mostCurrent])) - - return feeds - - def populate_article_metadata(self, article, soup, first): - header = soup.find('h3') - self.log('header: ' + self.tag_to_string(header)) - pass From f4e807cab02f344eabf1875f4a0563bcac9e77eb Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:16 +0200 Subject: [PATCH 17/20] Delete caijing.recipe --- recipes/caijing.recipe | 90 ------------------------------------------ 1 file changed, 90 deletions(-) delete mode 100644 recipes/caijing.recipe diff --git a/recipes/caijing.recipe b/recipes/caijing.recipe deleted file mode 100644 index e0825862b2..0000000000 --- a/recipes/caijing.recipe +++ /dev/null @@ -1,90 +0,0 @@ -import re -from calibre.web.feeds.recipes import BasicNewsRecipe - -__license__ = 'GPL v3' - - -class Caijing(BasicNewsRecipe): - - '''based on the recipe wrote by Eric Chen at 2011''' - - __author__ = '2014, Chen Wei ' - title = 'Caijing Magazine' - description = ''' - Founded in 1998, the fortnightly CAIJING Magazine has firmly established - itself as a news authority and leading voice for business and financial - issues in China. - - CAIJING Magazine closely tracks the most important aspects of China's - economic reforms, developments and policy changes, as well as major events - in the capital markets. It also offers a broad international perspective - through first-hand reporting on international political and economic - issues. - - CAIJING Magazine is China's most widely read business and finance magazine, - with a circulation of 225,000 per issue. 
It boasts top-level readers from - government, business and academic circles.''' - language = 'zh' - encoding = 'UTF-8' - publisher = 'Caijing Magazine' - publication_type = 'magazine' - category = 'news, Business, China' - timefmt = ' [%a, %d %b, %Y]' - needs_subscription = True - - remove_tags = [dict(attrs={'class': ['head_nav', 'mcont_logo', 'header', - 'bottom', 'footer', 'magazine_ipad', 'cjartShare', 'ar_about', - 'main_rt', 'mcont_nav', 'new']}), - dict(attrs={'id': ['articlePl']}), - dict(name=['script', 'noscript', 'style'])] - no_stylesheets = True - remove_javascript = True - current_issue_url = "" - current_issue_cover = "" - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open('http://service.caijing.com.cn/usermanage/login') - br.select_form(name='mainLoginForm') - br['username'] = self.username - br['password'] = self.password - br.submit() - return br - - def parse_index(self): - soup_start = self.index_to_soup('http://magazine.caijing.com.cn/') - jumpurl = soup_start.find('script').contents[0].split() - for line in jumpurl: - if 'http' in line.lower(): - issuesurl = line.split('"')[1] - break - - soup_issues = self.index_to_soup(issuesurl) - # find the latest issue - div = soup_issues.find('div', attrs={'class': 'fmcon'}) - current_issue_url = div.find('a', href=True)['href'] - - soup = self.index_to_soup(current_issue_url) - coverimg = soup.find('div', {'class': 'zzfm_img'}) - self.current_issue_cover = coverimg.find('img')['src'] - - feeds = [] - for section in soup.findAll('div', - attrs={'class': re.compile(r'(fmwz_ml|zzlm_nr)2?$')}): - section_title = self.tag_to_string(section.find('div', - attrs={'class': re.compile(r'(lmnav_bt|zzlm_bt)1?$')})) - self.log('Found section:', section_title) - articles = [] - for post in section.findAll('div', - attrs={'class': re.compile(r'(fmwz_bt|zzlm_nr_bt)')}): - title = self.tag_to_string(post) - url = post.find('a')['href'] - articles.append({'title': title, 'url': url, 'date': None}) - - if articles: - feeds.append((section_title, articles)) - return feeds - - def get_cover_url(self): - return self.current_issue_cover From 98777bf9da5e0c2ce4ef2ef71903529ddb1dcd86 Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:42:46 +0200 Subject: [PATCH 18/20] Delete bbc_chinese.recipe --- recipes/bbc_chinese.recipe | 46 -------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 recipes/bbc_chinese.recipe diff --git a/recipes/bbc_chinese.recipe b/recipes/bbc_chinese.recipe deleted file mode 100644 index 61531db3c9..0000000000 --- a/recipes/bbc_chinese.recipe +++ /dev/null @@ -1,46 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1277443634(BasicNewsRecipe): - title = u'BBC Chinese' - oldest_article = 7 - max_articles_per_feed = 100 - - feeds = [ - (u'\u4e3b\u9875', u'http://www.bbc.co.uk/zhongwen/simp/index.xml'), - (u'\u56fd\u9645\u65b0\u95fb', - u'http://www.bbc.co.uk/zhongwen/simp/world/index.xml'), - (u'\u4e24\u5cb8\u4e09\u5730', - u'http://www.bbc.co.uk/zhongwen/simp/china/index.xml'), - (u'\u91d1\u878d\u8d22\u7ecf', - u'http://www.bbc.co.uk/zhongwen/simp/business/index.xml'), - (u'\u7f51\u4e0a\u4e92\u52a8', - u'http://www.bbc.co.uk/zhongwen/simp/interactive/index.xml'), - (u'\u97f3\u89c6\u56fe\u7247', - u'http://www.bbc.co.uk/zhongwen/simp/multimedia/index.xml'), - (u'\u5206\u6790\u8bc4\u8bba', - 
u'http://www.bbc.co.uk/zhongwen/simp/indepth/index.xml') - ] - extra_css = ''' - @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n - body {margin-right: 8pt; font-family: 'DroidFont', serif;}\n - h1 {font-family: 'DroidFont', serif;}\n - .articledescription {font-family: 'DroidFont', serif;} - ''' - __author__ = 'rty' - __version__ = '1.0' - language = 'zh' - pubisher = 'British Broadcasting Corporation' - description = 'BBC news in Chinese' - category = 'News, Chinese' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - encoding = 'UTF-8' - conversion_options = {'linearize_tables': True} - masthead_url = 'http://wscdn.bbc.co.uk/zhongwen/simp/images/1024/brand.jpg' - keep_only_tags = [ - dict(name='h1'), - dict(name='p', attrs={'class': ['primary-topic', 'summary']}), - dict(name='div', attrs={'class': ['bodytext', 'datestamp']}), - ] From bad72317420c65076e875a90cac534b7ab6f16fa Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:43:52 +0200 Subject: [PATCH 19/20] Delete elperiodico_catalan.recipe --- recipes/elperiodico_catalan.recipe | 73 ------------------------------ 1 file changed, 73 deletions(-) delete mode 100644 recipes/elperiodico_catalan.recipe diff --git a/recipes/elperiodico_catalan.recipe b/recipes/elperiodico_catalan.recipe deleted file mode 100644 index 6d8be7749a..0000000000 --- a/recipes/elperiodico_catalan.recipe +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -__license__ = 'GPL v3' -__copyright__ = '30 October 2010, Jordi Balcells based on an earlier recipe by Darko Miletic ' -''' -elperiodico.cat -''' - -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -def new_tag(soup, name, attrs=()): - impl = getattr(soup, 'new_tag', None) - if impl is not None: - return impl(name, attrs=dict(attrs)) - return Tag(soup, name, attrs=attrs or None) - - -class ElPeriodico_cat(BasicNewsRecipe): - title = 'El Periodico de Catalunya' - __author__ = 'Jordi Balcells/Darko Miletic' - description = 'Noticies des de Catalunya' - publisher = 'elperiodico.cat' - category = 'news, politics, Spain, Catalunya' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - delay = 1 - encoding = 'cp1252' - language = 'ca' - - html2lrf_options = [ - '--comment', description, '--category', category, '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + \ - '"\ncomments="' + description + '"\ntags="' + category + '"' - - feeds = [(u'Portada', u'http://www.elperiodico.cat/ca/rss/rss_portada.xml'), - (u'Internacional', u'http://www.elperiodico.cat/ca/rss/internacional/rss.xml'), - (u'Societat', u'http://www.elperiodico.cat/ca/rss/societat/rss.xml'), - (u'Ci\xe8ncia i tecnologia', - u'http://www.elperiodico.cat/ca/rss/ciencia-i-tecnologia/rss.xml'), - (u'Esports', u'http://www.elperiodico.cat/ca/rss/esports/rss.xml'), - (u'Gent', u'http://www.elperiodico.cat/ca/rss/gent/rss.xml'), - (u'Opini\xf3', u'http://www.elperiodico.cat/ca/rss/opinio/rss.xml'), - (u'Pol\xedtica', u'http://www.elperiodico.cat/ca/rss/politica/rss.xml'), - (u'Barcelona', u'http://www.elperiodico.cat/ca/rss/barcelona/rss.xml'), - (u'Economia', u'http://www.elperiodico.cat/ca/rss/economia/rss.xml'), - (u'Cultura i espectacles', - u'http://www.elperiodico.cat/ca/rss/cultura-i-espectacles/rss.xml'), - (u'Tele', 
u'http://www.elperiodico.cat/ca/rss/tele/rss.xml')] - - keep_only_tags = [dict(name='div', attrs={'class': 'titularnoticia'}), - dict(name='div', attrs={'class': 'noticia_completa'})] - - remove_tags = [dict(name='div', attrs={'class': ['opcionb', 'opcionb last', 'columna_noticia']}), - dict(name='span', attrs={'class': 'opcionesnoticia'}) - ] - - def print_version(self, url): - return url.replace('/default.asp?', '/print.asp?') - - def preprocess_html(self, soup): - mcharset = new_tag(soup, 'meta', [ - ("http-equiv", "Content-Type"), ("content", "text/html; charset=utf-8")]) - soup.head.insert(0, mcharset) - for item in soup.findAll(style=True): - del item['style'] - return soup From f3111e2a72fcc21b0ee5b873e80ebce15486e64a Mon Sep 17 00:00:00 2001 From: Jony <23194385+jony0008@users.noreply.github.com> Date: Sun, 29 Mar 2020 19:45:49 +0200 Subject: [PATCH 20/20] Delete china_press.recipe --- recipes/china_press.recipe | 69 -------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 recipes/china_press.recipe diff --git a/recipes/china_press.recipe b/recipes/china_press.recipe deleted file mode 100644 index 39946e9490..0000000000 --- a/recipes/china_press.recipe +++ /dev/null @@ -1,69 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - - -class AdvancedUserRecipe1277228948(BasicNewsRecipe): - title = u'China Press USA' - oldest_article = 7 - max_articles_per_feed = 100 - - __author__ = 'rty' - __version__ = '1.0' - language = 'zh' - pubisher = 'www.chinapressusa.com' - description = 'Overseas Chinese Network Newspaper in the USA' - category = 'News in Chinese, USA' - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - encoding = 'UTF-8' - conversion_options = {'linearize_tables': True} - masthead_url = 'http://www.chinapressusa.com/common/images/logo.gif' - extra_css = ''' - @font-face { font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n - body { - margin-right: 8pt; - font-family: 'DroidFont', serif;} - h1 {font-family: 'DroidFont', serif, sans-serif} - .show {font-family: 'DroidFont', serif, sans-serif} - ''' - feeds = [ - (u'\u65b0\u95fb\u9891\u9053', u'http://news.uschinapress.com/news.xml'), - (u'\u534e\u4eba\u9891\u9053', u'http://chinese.uschinapress.com/chinese.xml'), - (u'\u8bc4\u8bba\u9891\u9053', u'http://review.uschinapress.com/review.xml'), - ] - keep_only_tags = [ - dict(name='div', attrs={'class': 'show'}), - ] - remove_tags = [ - # dict(name='table', attrs={'class':'xle'}), - dict(name='div', attrs={'class': 'time'}), - ] - remove_tags_after = [ - dict(name='div', attrs={'class': 'bank17'}), - # dict(name='a', attrs={'class':'ab12'}), - ] - - def append_page(self, soup, appendtag, position): - pager = soup.find('div', attrs={'id': 'displaypagenum'}) - if pager: - nexturl = self.INDEX + pager.a['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'class': 'show'}) - for it in texttag.findAll(style=True): - del it['style'] - newpos = len(texttag.contents) - self.append_page(soup2, texttag, newpos) - texttag.extract() - appendtag.insert(position, texttag) - - def preprocess_html(self, soup): - mtag = '\n' - soup.head.insert(0, mtag) - - for item in soup.findAll(style=True): - del item['style'] - self.append_page(soup, soup.body, 3) - pager = soup.find('div', attrs={'id': 'displaypagenum'}) - if pager: - pager.extract() - return soup