diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe
deleted file mode 100644
index 2f65c07fb4..0000000000
--- a/recipes/houston_chronicle.recipe
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from __future__ import print_function
-
-__license__ = 'GPL v3'
-__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
-'''
-chron.com
-'''
-import re
-import sys
-import time
-import traceback
-from collections import OrderedDict
-from datetime import datetime
-
-from calibre.ebooks.BeautifulSoup import NavigableString
-from calibre.utils.cleantext import clean_ascii_chars
-from calibre.utils.date import dt_factory, local_tz, utcfromtimestamp
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-regex_date_only = re.compile(r"""(?:January|February|March|April|
-    {8}May|June|July|August|September|October|November|
-    {8}December)\s[0-9]{1,2},\s20[01][0-9]""")
-regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
-blog_regex = re.compile(r'post-\d+')
-
-pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
-                     ('business', ('/business/', ['sports'])),
-                     ('sports', ('/sports/', ['business']))])
-
-base_url = "http://www.chron.com"
-
-# sports has 'core-package sports' class
-xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
-    contains(@class, 'wrapper') or
-    contains(@class, 'contentGroups') or
-    contains(@class, 'headline-list') or
-    contains(@class, 'core-package sports') or
-    contains(@class, 'news')]
-    //a[contains(@class, 'hdn-analytics')]"""
-
-excluded_titles = ["Winning numbers", "TV-radio listings"]
-
-
-def validate_link(page, link, title):
-    other_category = page[1][1]
-    if not title or len(title.strip()) < 5:
-        print("{0} rejected, title too short".format(link))
-        return None
-    parts = link.split('/')
-    if len(parts) > 3 and parts[3] in other_category:
-        print("{0} rejected, covered in other section".format(link))
-        return None
-    for excluded_title in excluded_titles:
-        if title.find(excluded_title) != -1:
-            print("{0} rejected, excluded title".format(link))
-            return None
-    return link, title
-
-
-def get_article_parsed(index_to_soup, this_url):
-    return index_to_soup(this_url, as_tree=True)
-
-
-def sort_subject(element_list):
-    # priority of subjects
-    subjects = ['news', 'neighborhood', 'entertainment']
-    subjects.reverse()
-    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
-    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
-    for element in element_list:
-        try:
-            subj = element[0].split('/')[3]
-        except Exception:
-            subj = 'unknown'
-        if subject_dict.get(subj) is not None:
-            rank_dict[subject_dict[subj] + 1].append(element)
-        else:
-            rank_dict[0].append(element)
-    # now return in reverse order, sorted
-    combined_list = []
-    for rank in range(len(subjects), -1, -1):
-        article_list = rank_dict[rank]
-        article_list.sort()
-        combined_list.extend(article_list)
-    return combined_list
-
-
-def get_links_from_section_page(index_to_soup, page):
-    page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
-    els = page_doc.xpath(xpath_general)
-    element_list = []
-    for el in els:
-        link = el.get('href').split('?')[0]
-        title = el.text
-        if title is None or len(title.strip()) < 5:
-            link_id = link.split('/')[-1][:-3].split('-')[:-1]
-            title = ' '.join(link_id)
-        if link[:4] != 'http':
-            link = base_url + link
-        validated_link = validate_link(page=page, link=link, title=title)
-        if validated_link is not None:
-            element_list.append(validated_link)
-    sorted_element_list = sort_subject(element_list)
-    return [page[0], sorted_element_list]
-
-
-def get_all_links_from_sections(index_to_soup):
-    all_sections = []
-    article_set = set()
-    final_dict = OrderedDict()
-    for item in pages.items():
-        print("getting links from {0}".format(item[0]))
-        all_sections.append(get_links_from_section_page(index_to_soup, item))
-    for section in all_sections:
-        section_id = section[0]
-        article_list = section[1]
-        final_dict[section_id] = []
-        for article in article_list:
-            if article[0] not in article_set:
-                article_set.add(article[0])
-                final_dict[section_id].append(article)
-    return final_dict
-
-
-# noinspection PyAbstractClass
-class HoustonChronicle(BasicNewsRecipe):
-    title = u'The Houston Chronicle'
-    description = 'News from Houston, Texas'
-    __author__ = 'Dale Furrow'
-    language = 'en'
-    no_stylesheets = True
-    remove_attributes = ['style', 'xmlns']
-    remove_empty_feeds = True
-    timefmt = '[%a, %d %b %Y]'
-    timestampfmt = '%Y%m%d%H%M%S'
-    # ignore_duplicate_articles = {'url'}  # defaults to None
-    extra_css = '.article_date {display: none}'
-    category = 'news, USA'
-    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
-    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
-    remove_tags = [dict(name='div', attrs={'social-title': True}),
-                   dict(name='div', attrs={'class':
-                        ['control-panel', 'gallery-overlay-inner',
-                         'most-popular', 'asset-media mos-playlist',
-                         'asset_media asset-media']}),
-                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
-                                                    'hst-resgalleryitem hidden']}),
-                   dict(name='ul', attrs={'class': 'clearfix'})]
-
-    # max_articles_per_feed = 5  # for use in testing
-
-    def get_article_description_from_doc(self, soup):
-        description_chars_break = 140
-        description_max_chars = 300
-        try:
-            els = soup.findAll('p')
-            if len(els) > 0:
-                out_text = ""
-                this_ellipsis = ""
-                for el in els:
-                    if el is not None:
-                        result = []
-                        for descendant in el.contents:
-                            if isinstance(descendant, NavigableString):
-                                result.append(type(u'')(descendant).strip())
-                        all_text = u' '.join(result)
-                        if len(all_text) > 1:
-                            sentences = re.findall(sentence_regex, all_text)
-                            if sentences is not None and len(sentences) > 0:
-                                for sentence in sentences:
-                                    if len(out_text) < description_chars_break:
-                                        out_text += sentence + " "
-                                    else:
-                                        if len(out_text) > description_max_chars:
-                                            this_ellipsis = "..."
-                                        return out_text[:description_max_chars] + this_ellipsis
-                return out_text
-            else:
-                return "No Article description returned"
-        except Exception as ex:
-            self.log('Error on Article Description')
-            traceback.print_exc(file=sys.stdout)
-            print(str(ex))
-            return ""
-
-    @staticmethod
-    def get_published_time_from_doc(page_doc):
-
-        def get_regular_timestamp(date_string):
-            try:
-                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
-                return out_date
-            except ValueError:
-                return None
-
-        el = page_doc.findAll(
-            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
-        if len(el) == 1:
-            return get_regular_timestamp(el[0].get('datetime'))
-        else:
-            return None
-
-    def populate_article_metadata(self, article, soup, first):
-        """
-        Called when each HTML page belonging to article is downloaded.
-        Intended to be used to get article metadata like author/summary/etc.
-        from the parsed HTML (soup).
-
-        :param article: A object of class :class:`calibre.web.feeds.Article`.
-            If you change the summary, remember to also change the text_summary
-        :param soup: Parsed HTML belonging to this article
-        :param first: True iff the parsed HTML is the first page of the article.
-        """
-        summary = self.get_article_description_from_doc(soup)
-        article_date = self.get_published_time_from_doc(soup)
-        if article_date is not None:
-            article_timestamp = float((article_date - utcfromtimestamp(0)).total_seconds())
-            article.date = article_timestamp
-            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
-            article.localtime = article.utctime.astimezone(local_tz)
-        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
-        article.summary = "{0}: {1}".format(summary_date, summary)
-        article.text_summary = clean_ascii_chars(article.summary)
-
-    def parse_index(self):
-        self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
-        feeds = []
-        sections = get_all_links_from_sections(self.index_to_soup)
-        for section_id, article_list in sections.items():
-            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
-            articles = []
-            for article_info in article_list:
-                self.log("Adding {0} to feed".format(article_info[0]))
-                articles.append({'title': article_info[1], 'url': article_info[0],
-                                 'description': '', 'date': ""})
-            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
-            feeds.append((section_id, articles))
-        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
-        return feeds
-
-    def preprocess_html(self, soup):
-        return soup
diff --git a/recipes/icons/houston_chronicle.png b/recipes/icons/houston_chronicle.png
deleted file mode 100644
index 2c4f5d366e..0000000000
Binary files a/recipes/icons/houston_chronicle.png and /dev/null differ
diff --git a/recipes/icons/poetrymagazine.png b/recipes/icons/poetrymagazine.png
deleted file mode 100644
index c4869ffd9a..0000000000
Binary files a/recipes/icons/poetrymagazine.png and /dev/null differ
diff --git a/recipes/icons/ultimahora.png b/recipes/icons/ultimahora.png
deleted file mode 100644
index 9fc6785752..0000000000
Binary files a/recipes/icons/ultimahora.png and /dev/null differ
diff --git a/recipes/icons/unica.png b/recipes/icons/unica.png
deleted file mode 100644
index 5a2607c8ed..0000000000
Binary files a/recipes/icons/unica.png and /dev/null differ
diff --git a/recipes/icons/united_daily.png b/recipes/icons/united_daily.png
deleted file mode 100644
index 90598e490d..0000000000
Binary files a/recipes/icons/united_daily.png and /dev/null differ
diff --git a/recipes/icons/unperiodico.png b/recipes/icons/unperiodico.png
deleted file mode 100644
index b7dca0d57a..0000000000
Binary files a/recipes/icons/unperiodico.png and /dev/null differ
diff --git a/recipes/icons/valbybladet_dk.png b/recipes/icons/valbybladet_dk.png
deleted file mode 100644
index 6fc915ca3d..0000000000
Binary files a/recipes/icons/valbybladet_dk.png and /dev/null differ
diff --git a/recipes/icons/vanloesebladet_dk.png b/recipes/icons/vanloesebladet_dk.png
deleted file mode 100644
index 6fc915ca3d..0000000000
Binary files a/recipes/icons/vanloesebladet_dk.png and /dev/null differ
diff --git a/recipes/icons/vardelokalavisen_dk.png b/recipes/icons/vardelokalavisen_dk.png
deleted file mode 100644
index 70464fce89..0000000000
Binary files a/recipes/icons/vardelokalavisen_dk.png and /dev/null differ
diff --git a/recipes/icons/veintitres.png b/recipes/icons/veintitres.png
deleted file mode 100644
index 9afc903d94..0000000000
Binary files a/recipes/icons/veintitres.png and /dev/null differ
diff --git a/recipes/icons/vejlelokalavisen_dk.png b/recipes/icons/vejlelokalavisen_dk.png
deleted file mode 100644
index 70464fce89..0000000000
Binary files a/recipes/icons/vejlelokalavisen_dk.png and /dev/null differ
diff --git a/recipes/icons/vesterbrobladet_dk.png b/recipes/icons/vesterbrobladet_dk.png
deleted file mode 100644
index 6fc915ca3d..0000000000
Binary files a/recipes/icons/vesterbrobladet_dk.png and /dev/null differ
diff --git a/recipes/icons/vfr_magazine.png b/recipes/icons/vfr_magazine.png
deleted file mode 100644
index aa7a668a50..0000000000
Binary files a/recipes/icons/vfr_magazine.png and /dev/null differ
diff --git a/recipes/icons/vice.png b/recipes/icons/vice.png
deleted file mode 100644
index 06ab0b8a77..0000000000
Binary files a/recipes/icons/vice.png and /dev/null differ
diff --git a/recipes/icons/vice_magazine.png b/recipes/icons/vice_magazine.png
deleted file mode 100644
index 53a6516b27..0000000000
Binary files a/recipes/icons/vice_magazine.png and /dev/null differ
diff --git a/recipes/icons/vice_magazine_de.png b/recipes/icons/vice_magazine_de.png
deleted file mode 100644
index 53a6516b27..0000000000
Binary files a/recipes/icons/vice_magazine_de.png and /dev/null differ
diff --git a/recipes/icons/vijesti.png b/recipes/icons/vijesti.png
deleted file mode 100644
index be3da20413..0000000000
Binary files a/recipes/icons/vijesti.png and /dev/null differ
diff --git a/recipes/icons/virtualshackles.png b/recipes/icons/virtualshackles.png
deleted file mode 100644
index b8cee80363..0000000000
Binary files a/recipes/icons/virtualshackles.png and /dev/null differ
diff --git a/recipes/icons/viva.png b/recipes/icons/viva.png
deleted file mode 100644
index 5456e393eb..0000000000
Binary files a/recipes/icons/viva.png and /dev/null differ
diff --git a/recipes/icons/vnexpress.png b/recipes/icons/vnexpress.png
deleted file mode 100644
index 8fbaf88371..0000000000
Binary files a/recipes/icons/vnexpress.png and /dev/null differ
diff --git a/recipes/icons/voetbal_belgie.png b/recipes/icons/voetbal_belgie.png
deleted file mode 100644
index 4715132f25..0000000000
Binary files a/recipes/icons/voetbal_belgie.png and /dev/null differ
diff --git a/recipes/icons/voice_of_america.png b/recipes/icons/voice_of_america.png
deleted file mode 100644
index 18e35f0395..0000000000
Binary files a/recipes/icons/voice_of_america.png and /dev/null differ
diff --git a/recipes/icons/vrijnederland.png b/recipes/icons/vrijnederland.png
deleted file mode 100644
index 6fe2f9649f..0000000000
Binary files a/recipes/icons/vrijnederland.png and /dev/null differ
diff --git a/recipes/icons/wallstreetro.png b/recipes/icons/wallstreetro.png
deleted file mode 100644
index 8af51dcfd9..0000000000
Binary files a/recipes/icons/wallstreetro.png and /dev/null differ
diff --git a/recipes/icons/wapo_cartoons.png b/recipes/icons/wapo_cartoons.png
deleted file mode 100644
index 83ab3c3418..0000000000
Binary files a/recipes/icons/wapo_cartoons.png and /dev/null differ
diff --git a/recipes/icons/wikinews_en.png b/recipes/icons/wikinews_en.png
deleted file mode 100644
index 5251d897d0..0000000000
Binary files a/recipes/icons/wikinews_en.png and /dev/null differ
diff --git a/recipes/icons/winsupersite.png b/recipes/icons/winsupersite.png
deleted file mode 100644
index d4c5442770..0000000000
Binary files a/recipes/icons/winsupersite.png and /dev/null differ
diff --git a/recipes/icons/wolne_media.png b/recipes/icons/wolne_media.png
deleted file mode 100644
index ecc45d4a86..0000000000
Binary files a/recipes/icons/wolne_media.png and /dev/null differ
diff --git a/recipes/icons/workers_world.png b/recipes/icons/workers_world.png
deleted file mode 100644
index 25cc936a50..0000000000
Binary files a/recipes/icons/workers_world.png and /dev/null differ
diff --git a/recipes/icons/wvhooligan.png b/recipes/icons/wvhooligan.png
deleted file mode 100644
index f3f44f2b4d..0000000000
Binary files a/recipes/icons/wvhooligan.png and /dev/null differ
diff --git a/recipes/icons/yakima_herald.png b/recipes/icons/yakima_herald.png
deleted file mode 100644
index 7e8de676aa..0000000000
Binary files a/recipes/icons/yakima_herald.png and /dev/null differ
diff --git a/recipes/icons/yazihane.png b/recipes/icons/yazihane.png
deleted file mode 100644
index 05290b90c4..0000000000
Binary files a/recipes/icons/yazihane.png and /dev/null differ
diff --git a/recipes/icons/yementimes.png b/recipes/icons/yementimes.png
deleted file mode 100644
index b74ecc420b..0000000000
Binary files a/recipes/icons/yementimes.png and /dev/null differ
diff --git a/recipes/icons/yomiuri.png b/recipes/icons/yomiuri.png
deleted file mode 100644
index a747d7df33..0000000000
Binary files a/recipes/icons/yomiuri.png and /dev/null differ
diff --git a/recipes/icons/zaman.png b/recipes/icons/zaman.png
deleted file mode 100644
index 11a741f2fa..0000000000
Binary files a/recipes/icons/zaman.png and /dev/null differ
diff --git a/recipes/icons/zaufana_trzecia_strona.png b/recipes/icons/zaufana_trzecia_strona.png
deleted file mode 100644
index dda45170e7..0000000000
Binary files a/recipes/icons/zaufana_trzecia_strona.png and /dev/null differ
diff --git a/recipes/icons/zaxid_net.png b/recipes/icons/zaxid_net.png
deleted file mode 100644
index 52f499e857..0000000000
Binary files a/recipes/icons/zaxid_net.png and /dev/null differ
diff --git a/recipes/icons/zdnet.png b/recipes/icons/zdnet.png
deleted file mode 100644
index 1a8038fd00..0000000000
Binary files a/recipes/icons/zdnet.png and /dev/null differ
diff --git a/recipes/icons/zerocalcare.png b/recipes/icons/zerocalcare.png
deleted file mode 100644
index 983a91c3ae..0000000000
Binary files a/recipes/icons/zerocalcare.png and /dev/null differ
diff --git a/recipes/icons/zita_be.png b/recipes/icons/zita_be.png
deleted file mode 100644
index d3a9af6829..0000000000
Binary files a/recipes/icons/zita_be.png and /dev/null differ
diff --git a/recipes/icons/zougla.png b/recipes/icons/zougla.png
deleted file mode 100644
index 95461dd865..0000000000
Binary files a/recipes/icons/zougla.png and /dev/null differ
diff --git a/recipes/poetrymagazine.recipe b/recipes/poetrymagazine.recipe
deleted file mode 100644
index 0aeae8c4f3..0000000000
--- a/recipes/poetrymagazine.recipe
+++ /dev/null
@@ -1,135 +0,0 @@
-import re
-from collections import OrderedDict
-from urllib.parse import urlparse
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-_issue_url = ""
-
-COMMA_SEP_RE = re.compile(r"\s*,\s*")
-SPACE_SEP_RE = re.compile(r"\s+")
-NON_NUMERIC_RE = re.compile(r"[^\d]+")
-
-
-class Poetry(BasicNewsRecipe):
-    title = "Poetry Magazine"
-    __author__ = "ping"
-    description = (
-        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
-        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
-    )
-    publication_type = "magazine"
-    language = "en"
-    encoding = "utf-8"
-    remove_javascript = True
-    no_stylesheets = True
-    auto_cleanup = False
-    ignore_duplicate_articles = {"url"}
-    compress_news_images = False
-
-    remove_attributes = ["style", "font"]
-    keep_only_tags = [dict(name="article")]
-
-    remove_tags = [
-        dict(name="button"),
-        dict(
-            attrs={
-                "class": [
-                    "c-socialBlocks",
-                    "c-index",
-                    "o-stereo",
-                    "u-hideAboveSmall",
-                    "c-slideTrigger",
-                    "js-slideshow",
-                ]
-            }
-        ),
-    ]
-
-    extra_css = """
-    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
-    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
-    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
-    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
-    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
-    div.c-feature-bd { margin-bottom: 2rem; }
-    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
-    """
-
-    def extract_from_img_srcset(self, srcset: str, max_width=0):
-        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
-        if len(sources) == 1:
-            # just a regular img url probably
-            return sources[0]
-        parsed_sources = []
-        for src in sources:
-            src_n_width = [s.strip() for s in SPACE_SEP_RE.split(src) if s.strip()]
-            if len(src_n_width) != 2:
-                raise ValueError(f"Not a valid srcset: {srcset}")
-            parsed_sources.append(
-                (
-                    src_n_width[0].strip(),
-                    int(NON_NUMERIC_RE.sub("", src_n_width[1].strip())),
-                )
-            )
-        parsed_sources = list(set(parsed_sources))
-        parsed_sources = sorted(parsed_sources, key=lambda x: x[1], reverse=True)
-        if not max_width:
-            return parsed_sources[0][0]
-        for img, width in parsed_sources:
-            if width <= max_width:
-                return img
-        return parsed_sources[-1][0]
-
-    def preprocess_html(self, soup):
-        for img in soup.select("div.o-mediaEnclosure img"):
-            if not img.get("srcset"):
-                continue
-            img["src"] = self.extract_from_img_srcset(img["srcset"], max_width=1000)
-        return soup
-
-    def parse_index(self):
-        if _issue_url:
-            soup = self.index_to_soup(_issue_url)
-        else:
-            soup = self.index_to_soup("https://www.poetryfoundation.org/poetrymagazine")
-            current_issue = soup.select("div.c-cover-media a")
-            if not current_issue:
-                self.abort_recipe_processing("Unable to find latest issue")
-            current_issue = current_issue[0]
-            soup = self.index_to_soup(current_issue["href"])
-
-        issue_edition = self.tag_to_string(soup.find("h1"))
-        self.timefmt = f" [{issue_edition}]"
-        cover_image = soup.select("div.c-issueBillboard-cover-media img")[0]
-        parsed_cover_url = urlparse(
-            cover_image["srcset"].split(",")[-1].strip().split(" ")[0]
-        )
-        self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"
-
-        sectioned_feeds = OrderedDict()
-
-        tabs = soup.find_all("div", attrs={"class": "c-tier_tabbed"})
-        for tab in tabs:
-            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
-            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
-            if not (tab_title and tab_content):
-                continue
-            tab_title = self.tag_to_string(tab_title)
-            sectioned_feeds[tab_title] = []
-            for li in tab_content.select("ul.o-blocks > li"):
-                author = self.tag_to_string(
-                    li.find("span", attrs={"class": "c-txt_attribution"})
-                )
-                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
-                    self.log("Found article:", self.tag_to_string(link))
-                    sectioned_feeds[tab_title].append(
-                        {
-                            "title": self.tag_to_string(link),
-                            "url": link["href"],
-                            "author": author,
-                            "description": author,
-                        }
-                    )
-
-        return sectioned_feeds.items()
diff --git a/recipes/ultimahora.recipe b/recipes/ultimahora.recipe
deleted file mode 100644
index e2b16570b7..0000000000
--- a/recipes/ultimahora.recipe
+++ /dev/null
@@ -1,52 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
-'''
-ultimahora.com
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class UltimaHora_py(BasicNewsRecipe):
-    title = 'Ultima Hora'
-    __author__ = 'Darko Miletic'
-    description = 'Noticias de Paraguay y el resto del mundo'
-    publisher = 'EDITORIAL EL PAIS S.A.'
-    category = 'news, politics, Paraguay'
-    oldest_article = 2
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'cp1252'
-    use_embedded_content = False
-    language = 'es_PY'
-    remove_empty_feeds = True
-    publication_type = 'newspaper'
-    masthead_url = 'http://www.ultimahora.com/imgs/uh-com.gif'
-    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .sub_titulo_mediano,.TituloNota{font-family: Georgia,"Times New Roman",Times,serif} .sub_titulo_mediano{font-weight: bold} '  # noqa
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
-
-    remove_tags = [
-        dict(name=['form', 'iframe', 'embed', 'object', 'link', 'base', 'table'])]
-    keep_only_tags = [
-        dict(attrs={'id': ['nota_titulo', 'nota_copete', 'texto']})]
-
-    feeds = [
-
-        (u'Arte y Espectaculos', u'http://www.ultimahora.com/adjuntos/rss/UHEspectaculos.xml'),
-        (u'Ciudad del Este', u'http://www.ultimahora.com/adjuntos/rss/UHCDE.xml'),
-        (u'Deportes', u'http://www.ultimahora.com/adjuntos/rss/UHDeportes.xml'),
-        (u'Ultimo momento', u'http://www.ultimahora.com/adjuntos/rss/UltimoMomento.xml'),
-        (u'Nacionales', u'http://www.ultimahora.com/adjuntos/rss/uh-rss-nacionales.xml'),
-        (u'Politica', u'http://www.ultimahora.com/adjuntos/rss/uh-rss-politica.xml'),
-        (u'Sucesos', u'http://www.ultimahora.com/adjuntos/rss/uh-rss-sucesos.xml'),
-        (u'Economia', u'http://www.ultimahora.com/adjuntos/rss/uh-rss-economia.xml'),
-        (u'Ciencia y Tecnologia', u'http://www.ultimahora.com/adjuntos/rss/uh-rss-ciencia.xml')
-    ]
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
diff --git a/recipes/unica.recipe b/recipes/unica.recipe
deleted file mode 100644
index 9f1a117ae5..0000000000
--- a/recipes/unica.recipe
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL v3'
-__copyright__ = u'2011, Silviu Cotoar\u0103'
-'''
-unica.ro
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class Unica(BasicNewsRecipe):
-    title = u'Unica'
-    __author__ = u'Silviu Cotoar\u0103'
-    description = 'Asa cum esti tu'
-    publisher = 'Unica'
-    oldest_article = 5
-    language = 'ro'
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    use_embedded_content = False
-    category = 'Ziare,Reviste,Femei'
-    encoding = 'utf-8'
-    cover_url = 'http://www.unica.ro/fileadmin/images/logo.gif'
-
-    conversion_options = {
-        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'id': 'sticky'}), dict(
-            name='p', attrs={'class': 'bodytext'})
-
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'class': ['top-links']}), dict(name='div', attrs={'id': ['autor_name']}), dict(name='div', attrs={
-            'class': ['box-r']}), dict(name='div', attrs={'class': ['category']}), dict(name='div', attrs={'class': ['data']})
-    ]
-
-    remove_tags_after = [
-        dict(name='ul', attrs={'class': 'pager'})
-    ]
-
-    feeds = [
-        (u'Feeds', u'http://www.unica.ro/rss.html')
-    ]
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
diff --git a/recipes/united_daily.recipe b/recipes/united_daily.recipe
deleted file mode 100644
index 3a10566772..0000000000
--- a/recipes/united_daily.recipe
+++ /dev/null
@@ -1,86 +0,0 @@
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class UnitedDaily(BasicNewsRecipe):
-    title = u'聯合新聞網'
-    oldest_article = 1
-    max_articles_per_feed = 100
-
-    feeds = [(u'焦點', u'http://udn.com/udnrss/focus.xml'),
-             (u'政治', u'http://udn.com/udnrss/politics.xml'),
-             (u'社會', u'http://udn.com/udnrss/social.xml'),
-             (u'生活', u'http://udn.com/udnrss/life.xml'),
-             (u'綜合', u'http://udn.com/udnrss/education.xml'),
-             (u'意見評論', u'http://udn.com/udnrss/opinion.xml'),
-             (u'校園博覽會', u'http://mag.udn.com/udnrss/campus_rss.xml'),
-             (u'大台北', u'http://udn.com/udnrss/local_taipei.xml'),
-             (u'桃竹苗', u'http://udn.com/udnrss/local_tyhcml.xml'),
-             (u'中彰投', u'http://udn.com/udnrss/local_tcchnt.xml'),
-             (u'雲嘉南', u'http://udn.com/udnrss/local_ylcytn.xml'),
-             (u'高屏離島', u'http://udn.com/udnrss/local_ksptisland.xml'),
-             (u'基宜花東', u'http://udn.com/udnrss/local_klilhltt.xml'),
-             (u'台灣百寶鄉', u'http://udn.com/udnrss/local_oddlyenough.xml'),
-             (u'台灣人物', u'http://mag.udn.com/udnrss/people_rss.xml'),
-             (u'兩岸要聞', u'http://udn.com/udnrss/mainland.xml'),
-             (u'國際焦點', u'http://udn.com/udnrss/international.xml'),
-             (u'台商經貿', u'http://udn.com/udnrss/financechina.xml'),
-             (u'國際財經', u'http://udn.com/udnrss/financeworld.xml'),
-             (u'全球觀察', u'http://mag.udn.com/udnrss/world_rss.xml'),
-             (u'財經焦點', u'http://udn.com/udnrss/financesfocus.xml'),
-             (u'股市要聞', u'http://udn.com/udnrss/stock.xml'),
-             (u'股市快訊', u'http://udn.com/udnrss/stklatest.xml'),
-             (u'稅務法務', u'http://udn.com/udnrss/tax.xml'),
-             (u'房市情報', u'http://udn.com/udnrss/houses.xml'),
-             (u'個人理財', u'http://mag.udn.com/udnrss/wealth_rss.xml'),
-             (u'研究報告', u'http://mag.udn.com/udnrss/report_rss.xml'),
-             (u'基金', u'http://mag.udn.com/udnrss/fund_rss.xml'),
-             (u'理財會客室', u'http://mag.udn.com/udnrss/m_forum_rss.xml'),
-             (u'棒球', u'http://udn.com/udnrss/baseball.xml'),
-             (u'籃球', u'http://udn.com/udnrss/basketball.xml'),
-             (u'體壇動態', u'http://udn.com/udnrss/sportsfocus.xml'),
-             (u'熱門星聞', u'http://udn.com/udnrss/starsfocus.xml'),
-             (u'廣電港陸', u'http://udn.com/udnrss/tv.xml'),
-             (u'海外星球', u'http://udn.com/udnrss/starswestern.xml'),
-             (u'日韓星情', u'http://udn.com/udnrss/starsjk.xml'),
-             (u'電影世界', u'http://udn.com/udnrss/movie.xml'),
-             (u'流行音樂', u'http://udn.com/udnrss/music.xml'),
-             (u'觀點專題', u'http://udn.com/udnrss/starssubject.xml'),
-             (u'消費流行', u'http://mag.udn.com/udnrss/happylife_rss.xml'),
-             (u'食樂指南', u'http://udn.com/udnrss/food.xml'),
-             (u'數位資訊', u'http://mag.udn.com/udnrss/digital_rss.xml'),
-             (u'折扣好康', u'http://udn.com/udnrss/shopping.xml'),
-             (u'發燒車訊', u'http://mag.udn.com/udnrss/car_rss.xml'),
-             (u'醫藥新聞', u'http://udn.com/udnrss/health.xml'),
-             (u'家婦繽紛', u'http://udn.com/udnrss/benfen.xml'),
-             (u'談星論命', u'http://udn.com/udnrss/astrology.xml'),
-             (u'文化副刊', u'http://udn.com/udnrss/reading.xml'),
-             (u'旅遊休閒', u'http://travel.udn.com/udnrss/travel_rss.xml'),
-             (u'健康醫藥', u'http://mag.udn.com/udnrss/life_rss.xml'),
-             ]
-
-    extra_css = '''div[id='story_title'] {font-size:200%; font-weight:bold;} td[class='story_title'] {font-size:200%;
-                   font-weight:bold;} td[class='story_title'] td[class='story_title']>div {font-size:200%; font-weight:bold;}'''  # noqa
-
-    __author__ = 'Eddie Lau'
-    __version__ = '1.2'
-    language = 'zh_TW'
-    publisher = 'United Daily News Group'
-    description = 'United Daily (Taiwan)'
-    category = 'News, Chinese, Taiwan'
-    remove_javascript = True
-    use_embedded_content = False
-    no_stylesheets = True
-    encoding = 'utf-8'
-    conversion_options = {'linearize_tables': True}
-    masthead_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
-    cover_url = 'http://udn.com/NEWS/2004/images/logo_udn.gif'
-    auto_cleanup = True
-    # keep_only_tags = [dict(name='td', attrs={'class':['story_title']}),
-    #                   dict(name='div', attrs={'id':['story_title']}),
-    #                   dict(name='td', attrs={'class':['story_author']}),
-    #                   dict(name='div', attrs={'id':['story_author']}),
-    #                   dict(name='td', attrs={'class':['story']}),
-    #                   dict(name='div', attrs={'id':['story']}),
-    #                   ]
diff --git a/recipes/unperiodico.recipe b/recipes/unperiodico.recipe
deleted file mode 100644
index 2c9be63762..0000000000
--- a/recipes/unperiodico.recipe
+++ /dev/null
@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-# https://github.com/iemejia/calibrecolombia
-
-'''
-http://www.unperiodico.unal.edu.co/
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class UNPeriodico(BasicNewsRecipe):
-    title = u'UN Periodico'
-    language = 'es_CO'
-    __author__ = 'Ismael Mejia '
-    cover_url = 'http://www.unperiodico.unal.edu.co/fileadmin/templates/periodico/img/logoperiodico.png'
-    description = 'UN Periodico'
-    oldest_article = 30
-    max_articles_per_feed = 100
-    publication_type = 'newspaper'
-    feeds = [
-        (u'UNPeriodico', u'http://www.unperiodico.unal.edu.co/rss/type/rss2/')
-    ]
diff --git a/recipes/utrinski.recipe b/recipes/utrinski.recipe
deleted file mode 100644
index 79efed89f9..0000000000
--- a/recipes/utrinski.recipe
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-
-__author__ = 'Darko Spasovski'
-__license__ = 'GPL v3'
-__copyright__ = '2011, Darko Spasovski '
-'''
-utrinski.com.mk
-'''
-
-import datetime
-import re
-
-from calibre import browser
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class UtrinskiVesnik(BasicNewsRecipe):
-
-    INDEX = 'http://www.utrinski.com.mk/'
-    title = 'Utrinski Vesnik'
-    description = 'Daily Macedonian newspaper'
-    masthead_url = 'http://www.utrinski.com.mk/images/LogoTop.jpg'
-    language = 'mk'
-    remove_javascript = True
-    publication_type = 'newspaper'
-    category = 'news, Macedonia'
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    use_embedded_content = False
-    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-        # Remove anything before the start of the article.
-        (r'', lambda match: ''),
-
-        # Remove anything after the end of the article.
-        (r'