American Prospect, FactCheck, PolitiFact by Michael Heinz

2025-07-09 03:04:10 -04:00 · 2010-05-18 18:03:16 -06:00 · 2010-05-18 18:03:16 -06:00 · b1287f0a51
commit b1287f0a51
parent e1b988598c
6 changed files with 82 additions and 11 deletions
--- a/resources/recipes/aprospect.recipe
+++ b/resources/recipes/aprospect.recipe
@@ -0,0 +1,26 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AmericanProspect(BasicNewsRecipe):  # calibre news recipe for the American Prospect articles feed
+    title          = u'American Prospect'
+    __author__     = u'Michael Heinz'
+    oldest_article = 30
+    language = 'en'
+    max_articles_per_feed = 100
+    recursions = 0
+    no_stylesheets = True
+    remove_javascript = True
+
+    preprocess_regexps = [  # (compiled pattern, replacement callable) pairs; presumably applied in order to each page -- confirm against BasicNewsRecipe
+        (re.compile(r'<body.*?<div class="pad_10L10R">', re.DOTALL|re.IGNORECASE), lambda match: '<body><div>'),  # non-greedy: drop everything from <body up to the first pad_10L10R div
+        (re.compile(r'</div>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</div></body>'),  # greedy: drop everything from the first </div> through the last </body>
+        (re.compile('\r'),lambda match: ''),  # strip carriage returns
+        (re.compile(r'<!-- .+? -->', re.DOTALL|re.IGNORECASE), lambda match: ''),  # strip HTML comments (only those with a space after <!-- and before -->)
+        (re.compile(r'<link .+?>', re.DOTALL|re.IGNORECASE), lambda match: ''),  # strip <link ...> tags
+        (re.compile(r'<script.*?</script>', re.DOTALL|re.IGNORECASE), lambda match: ''),  # strip script elements including their bodies
+        (re.compile(r'<noscript.*?</noscript>', re.DOTALL|re.IGNORECASE), lambda match: ''),  # strip noscript elements including their bodies
+        (re.compile(r'<meta .*?/>', re.DOTALL|re.IGNORECASE), lambda match: ''),  # strip <meta ... /> tags; NOTE(review): a <meta ...> without the closing slash would match through to the next '/>' -- confirm the site always self-closes
+    ]
+
+    feeds       = [(u'Articles', u'feed://www.prospect.org/articles_rss.jsp')]  # (section title, feed URL) pairs
+
--- a/resources/recipes/factcheck.recipe
+++ b/resources/recipes/factcheck.recipe
@@ -0,0 +1,19 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class FactCheckOrg(BasicNewsRecipe):  # calibre news recipe for the factcheck.org feed
+    title          = u'Factcheck'
+    __author__     = u'Michael Heinz'
+    language = 'en'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    recursions = 0  # attribute name is plural (as in the American Prospect recipe); the singular spelling is silently ignored
+
+    publication_type = 'magazine'
+    masthead_url = 'http://factcheck.org/wp-content/themes/Streamline/images/headernew.jpg'  # same image is used for masthead and cover
+    cover_url = 'http://factcheck.org/wp-content/themes/Streamline/images/headernew.jpg'
+
+    remove_tags = [ dict({'id':['footer','footerabout','sidebar']}) ]  # matches elements by id: footer, footerabout, sidebar
+
+
+    feeds          = [(u'Factcheck', u'feed://www.factcheck.org/feed/')]  # (section title, feed URL) pairs
+
--- a/resources/recipes/politifact.recipe
+++ b/resources/recipes/politifact.recipe
@@ -0,0 +1,30 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class PolitiFactCom(BasicNewsRecipe):  # calibre news recipe for the politifact.com feeds
+    title          = u'Politifact'
+    __author__     = u'Michael Heinz'
+    oldest_article = 21
+    max_articles_per_feed = 100
+    recursions = 0  # attribute name is plural (as in the American Prospect recipe); the singular spelling is silently ignored
+    language = 'en'
+
+    no_stylesheets = True
+
+    publication_type = 'magazine'
+    masthead_url = 'http://static.politifact.com.s3.amazonaws.com/images/politifactdotcom-flag-fff_01.png'  # same image is used for masthead and cover
+    cover_url = 'http://static.politifact.com.s3.amazonaws.com/images/politifactdotcom-flag-fff_01.png'
+
+    remove_tags = [  # div classes to strip: pfstoryarchive, pfhead, boxmid
+                     dict(name='div', attrs={'class':'pfstoryarchive'}),
+                     dict(name='div', attrs={'class':'pfhead'}),
+                     dict(name='div', attrs={'class':'boxmid'}),
+                  ]
+
+    keep_only_tags = [dict(name='div', attrs={'class':'pfcontentleft'})]  # article content is taken from the pfcontentleft div
+    feeds          = [  # (section title, feed URL) pairs
+                     (u'Articles', u'http://www.politifact.com/feeds/articles/truth-o-meter/'),
+                     (u'Obamameter', u'http://politifact.com/feeds/updates/'),
+                     (u'Statements', u'http://www.politifact.com/feeds/statements/truth-o-meter/')
+                     ]
+
+
--- a/src/calibre/devices/manager.py
+++ b/src/calibre/devices/manager.py
@@ -34,6 +34,6 @@ class Worker(threading.Thread):
    def run(self):
        '''Thread loops taking jobs from the queue as they become available'''
        while True:
-            job = self.jobs.get(True, None)
+            self.jobs.get(True, None)
            # Do job
            self.jobs.task_done()
--- a/src/calibre/ebooks/rtf2xml/paragraph_def.py
+++ b/src/calibre/ebooks/rtf2xml/paragraph_def.py
@@ -354,7 +354,6 @@ if another paragraph_def is found, the state changes to collect_tokens.
    def __tab_stop_func(self, line):
        """
        """
-        type = 'tabs-%s' % self.__tab_type
        self.__att_val_dict['tabs'] += '%s:' % self.__tab_type
        self.__att_val_dict['tabs'] += '%s;' % line[20:-1]
        self.__tab_type = 'left'
@@ -373,7 +372,6 @@ if another paragraph_def is found, the state changes to collect_tokens.
        """
        leader = self.__tab_type_dict.get(self.__token_info)
        if leader != None:
-            type = 'tabs-%s' % self.__tab_type
            self.__att_val_dict['tabs'] += '%s^' % leader
        else:
            if self.__run_level > 3:
--- a/src/calibre/ebooks/rtf2xml/styles.py
+++ b/src/calibre/ebooks/rtf2xml/styles.py
@@ -318,7 +318,6 @@ class Styles:
            Try to add the number to dictionary entry tabs-left, or tabs-right, etc.
            If the dictionary entry doesn't exist, create one.
        """
-        type = 'tabs-%s' % self.__tab_type
        try:
            if self.__leader_found:
                self.__styles_dict['par'][self.__styles_num]['tabs']\
@@ -362,7 +361,6 @@ class Styles:
        leader = self.__tab_type_dict.get(self.__token_info)
        if leader != None:
            leader += '^'
-            type = 'tabs-%s' % self.__tab_type
            try:
                self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
            except KeyError: