diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 2bccbaf4ae..9b2d4854bb 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -1,61 +1,648 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +## +## Title: BBC News, Sport, and Blog Calibre Recipe +## Contact: mattst - jmstanfield@gmail.com +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## Copyright: mattst - jmstanfield@gmail.com +## +## Written: November 2011 +## Last Edited: 2011-11-19 +## + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +__copyright__ = 'mattst - jmstanfield@gmail.com' + + ''' -news.bbc.co.uk +BBC News, Sport, and Blog Calibre Recipe ''' + +# Import the regular expressions module. import re + +# Import the BasicNewsRecipe class which this class extends. from calibre.web.feeds.recipes import BasicNewsRecipe -class BBC(BasicNewsRecipe): - title = 'BBC News' - __author__ = 'Darko Miletic, Starson17' - description = 'News from UK. 
' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - #delay = 1 - use_embedded_content = False - encoding = 'utf8' - publisher = 'BBC' - category = 'news, UK, world' - language = 'en_GB' - publication_type = 'newsportal' - extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] - conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - ,'linearize_tables': True +class BBCNewsSportBlog(BasicNewsRecipe): + + # + # **** IMPORTANT USERS READ ME **** + # + # First select the feeds you want then scroll down below the feeds list + # and select the values you want for the other user preferences, like + # oldest_article and such like. + # + # + # Select the BBC rss feeds which you want in your ebook. + # Selected feed have NO '#' at their start, de-selected feeds begin with a '#'. + # + # Eg. ("News Home", "http://feeds.bbci.co.uk/... - include feed. + # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed. + # + # There are 68 feeds below which constitute the bulk of the available rss + # feeds on the BBC web site. These include 5 blogs by editors and + # correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West + # Wales, Scotland Business), and 7 Welsh language feeds. + # + # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click) + # so if "oldest_article = 1.5" (only articles published in the last 36 hours) + # you may get some 'empty feeds' which will not then be included in the ebook. + # + # The 15 feeds currently selected below are simply my default ones. 
+ # + # Note: With all 68 feeds selected, oldest_article set to 2, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 10, + # the ebook creation took 29 minutes on my speedy 100 mbps net connection, + # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx). + # More realistically with 15 feeds selected, oldest_article set to 1.5, + # max_articles_per_feed set to 100, and simultaneous_downloads set to 20, + # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'. + # + # Select / de-select the feeds you want in your ebook. + # + feeds = [ + ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"), + ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"), + ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"), + #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"), + #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"), + #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"), + #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"), + #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"), + #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"), + #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"), + #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"), + #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"), + ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"), + ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"), + ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"), + ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"), + ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"), + ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"), + #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"), + #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"), + ("Business", 
"http://feeds.bbci.co.uk/news/business/rss.xml"), + ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"), + ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"), + #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"), + #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"), + ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"), + #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"), + #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"), + #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"), + ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"), + ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"), + #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"), + #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"), + #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"), + #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"), + #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"), + #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"), + #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"), + #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"), + #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"), + #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"), + #("Horse Racing", 
"http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"), + #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"), + #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"), + #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"), + #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"), + #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"), + #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"), + #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"), + #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"), + #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"), + #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"), + #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"), + #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"), + #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"), + #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"), + #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"), + #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"), + #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"), + #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"), + #("SE. 
Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"), + #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"), + #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"), + #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"), + #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"), + #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"), + #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"), + #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"), + ] + + + # **** SELECT YOUR USER PREFERENCES **** + + # Title to use for the ebook. + # + title = 'BBC News' + + # A brief description for the ebook. + # + description = u'BBC web site ebook created using rss feeds.' + + # The max number of articles which may be downloaded from each feed. + # I've never seen more than about 70 articles in a single feed in the + # BBC feeds. + # + max_articles_per_feed = 100 + + # The max age of articles which may be downloaded from each feed. This is + # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a + # half days). My default of 1.5 days is the last 36 hours, the point at + # which I've decided 'news' becomes 'old news', but be warned this is not + # so good for the blogs, technology, magazine, etc., and sports feeds. + # You may wish to extend this to 2-5 but watch out ebook creation time will + # increase as well. Setting this to 30 will get everything (AFAICT) as long + # as max_articles_per_feed remains set high (except for 'Click' which is + # v. low volume and its currently oldest article is 4th Feb 2011). + # + oldest_article = 1.5 + + # Number of simultaneous downloads. 20 is consistently working fine on the + # BBC News feeds with no problems. Speeds things up from the default of 5. 
+ # If you have a lot of feeds and/or have increased oldest_article above 2 + # then you may wish to try increasing simultaneous_downloads to 25-30, + # Or, of course, if you are in a hurry. [I've not tried beyond 20.] + # + simultaneous_downloads = 20 + + # Timeout for fetching files from the server in seconds. The default of + # 120 seconds, seems somewhat excessive. + # + timeout = 30 + + # The format string for the date shown on the ebook's first page. + # List of all values: http://docs.python.org/library/time.html + # Default in news.py has a leading space so that's mirrored here. + # As with 'feeds' select/de-select by adding/removing the initial '#', + # only one timefmt should be selected, here's a few to choose from. + # + timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default) + #timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30] + #timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM] + #timefmt = ' [%d %b %Y]' # [14 Nov 2011] + #timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18:30] + #timefmt = ' [%Y-%m-%d]' # [2011-11-14] + #timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30] + + + + # + # **** IMPORTANT **** + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING. + # + # I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :) + # + # **** IMPORTANT **** + # + + + + # Author of this recipe. + __author__ = 'mattst' + + # Specify English as the language of the RSS feeds (ISO-639 code). + language = 'en_GB' + + # Set tags. + tags = 'news, sport, blog' + + # Set publisher and publication type. + publisher = 'BBC' + publication_type = 'newspaper' + + # Disable stylesheets from site. + no_stylesheets = True + + # Specifies an override encoding for sites that have an incorrect charset + # specified. Default of 'None' says to auto-detect. 
Some other BBC recipes + # use 'utf8', which works fine (so use that if necessary) but auto-detecting + # with None is working fine, so stick with that for robustness. + encoding = None + + # Sets whether a feed has full articles embedded in it. The BBC feeds do not. + use_embedded_content = False + + # Removes empty feeds - why keep them!? + remove_empty_feeds = True + + # Create a custom title which fits nicely in the Kindle title list. + # Requires "import time" above class declaration, and replacing + # title with custom_title in conversion_options (right column only). + # Example of string below: "BBC News - 14 Nov 2011" + # + # custom_title = "BBC News - " + time.strftime('%d %b %Y') + + ''' + # Conversion options for advanced users, but don't forget to comment out the + # current conversion_options below. Avoid setting 'linearize_tables' as that + # plays havoc with the 'old style' table based pages. + # + conversion_options = { 'title' : title, + 'comments' : description, + 'tags' : tags, + 'language' : language, + 'publisher' : publisher, + 'authors' : publisher, + 'smarten_punctuation' : True } + ''' - keep_only_tags = [ - dict(name='div', attrs={'class':['layout-block-a layout-block']}) - ,dict(attrs={'class':['story-body','storybody']}) - ] + conversion_options = { 'smarten_punctuation' : True } - remove_tags = [ - dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', - 'story-feature wide ', 'story-feature narrow']}), - dict(id=['hypertab', 'comment-form']), - ] + # Specify extra CSS - overrides ALL other CSS (IE. Added last). 
+ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { text-align: center; font-size: 175%; font-weight: bold; } \ + h2 { text-align: center; font-size: 150%; font-weight: bold; } \ + h3 { text-align: center; font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }' - remove_attributes = ['width','height'] + # Remove various tag attributes to improve the look of the ebook pages. 
+ remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] - feeds = [ - ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'), - ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'), - ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'), - ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'), - ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'), - ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'), - ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'), - ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'), - ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'), - ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'), - ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'), - ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'), - ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'), - ] + # Remove the (admittedly rarely used) line breaks, "
", which sometimes + # cause a section of the ebook to start in an unsightly fashion or, more + # frequently, a "
" will muck up the formatting of a correspondant's byline. + # "
" and "
" are far more frequently used on the table formatted + # style of pages, and really spoil the look of the ebook pages. + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: '')] + + # Create regular expressions for tag keeping and removal to make the matches more + # robust against minor changes and errors in the HTML, Eg. double spaces, leading + # and trailing spaces, missing hyphens, and such like. + # Python regular expression ('re' class) page: http://docs.python.org/library/re.html + + # *************************************** + # Regular expressions for keep_only_tags: + # *************************************** + + # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML + # page which contains the main text of the article. Match storybody variants: 'storybody', + # 'story-body', 'story body','storybody ', etc. + storybody_reg_exp = '^.*story[_ -]*body.*$' + + # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title + # and published date. This is one level above the usual news pages which have the title + # and date within 'story-body'. This is annoying since 'blq_content' must also be kept, + # resulting in a lot of extra things to be removed by remove_tags. + blq_content_reg_exp = '^.*blq[_ -]*content.*$' + + # The BBC has an alternative page design structure, which I suspect is an out-of-date + # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack' + # (travel), and in some sport pages. These alternative pages are table based (which is + # why I think they are an out-of-date design) and account for -I'm guesstimaking- less + # than 1% of all articles. They use a table class 'storycontent' to hold the article + # and like blq_content (above) have required lots of extra removal by remove_tags. 
+ story_content_reg_exp = '^.*story[_ -]*content.*$' + + # Keep the sections of the HTML which match the list below. The HTML page created by + # Calibre will fill with those sections which are matched. Note that the + # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to + # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body' + # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at + # all). If they are the other way around in keep_only_tags then blq_content_reg_exp + # will end up being discarded. + keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ] + + # ************************************ + # Regular expressions for remove_tags: + # ************************************ + + # Regular expression to remove share-help and variant tags. The share-help class + # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious, + # twitter, email. Removed to avoid page clutter. + share_help_reg_exp = '^.*share[_ -]*help.*$' + + # Regular expression to remove embedded-hyper and variant tags. This class is used to + # display links to other BBC News articles on the same/similar subject. + embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$' + + # Regular expression to remove hypertabs and variant tags. This class is used to + # display a tab bar at the top of an article which allows the user to switch to + # an article (viewed on the same page) providing further info., 'in depth' analysis, + # an editorial, a correspondant's blog entry, and such like. 
The ability to handle + # a tab bar of this nature is currently beyond the scope of this recipe and + # possibly of Calibre itself (not sure about that - TO DO - check!). + hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$' + + # Regular expression to remove story-feature and variant tags. Eg. 'story-feature', + # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'. + # This class is used to add additional info. boxes, or small lists, outside of + # the main story. TO DO: Work out a way to incorporate these neatly. + story_feature_reg_exp = '^.*story[_ -]*feature.*$' + + # Regular expression to remove video and variant tags, Eg. 'videoInStoryB', + # 'videoInStoryC'. This class is used to embed video. + video_reg_exp = '^.*video.*$' + + # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'. + # This class is used to embed audio. + audio_reg_exp = '^.*audio.*$' + + # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'. + # This class is used to embed a photo slideshow. See also 'slideshow' below. + picture_gallery_reg_exp = '^.*picture.*$' + + # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'. + # This class is used to embed a slideshow (not necessarily photo) but both + # 'slideshow' and 'pictureGallery' are used for slideshows. + slideshow_reg_exp = '^.*slide[_ -]*show.*$' + + # Regular expression to remove social-links and variant tags. This class is used to + # display links to a BBC blogger's main page, used in various columnists' blogs + # (Eg. Nick Robinson, Robert Peston). + social_links_reg_exp = '^.*social[_ -]*links.*$' + + # Regular expression to remove quote and (multi) variant tags, Eg. 'quote', + # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually + # removed by 'story-feature' removal (as they are usually within them), but + # not always. 
The quotation removed is always (AFAICT) in the article text + # as well but a 2nd copy is placed in a quote tag to draw attention to it. + # The quote class tags may or may not appear in div's. + quote_reg_exp = '^.*quote.*$' + + # Regular expression to remove hidden and variant tags, Eg. 'hidden'. + # The purpose of these is unclear, they seem to be an internal link to a + # section within the article, but the text of the link (Eg. 'Continue reading + # the main story') never seems to be displayed anyway. Removed to avoid clutter. + # The hidden class tags may or may not appear in div's. + hidden_reg_exp = '^.*hidden.*$' + + # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'. + # Used on the site to display text about registered users entering comments. + comment_reg_exp = '^.*comment.*$' + + # Regular expression to remove form and variant tags, Eg. 'comment-form'. + # Used on the site to allow registered BBC users to fill in forms, typically + # for entering comments about an article. + form_reg_exp = '^.*form.*$' + + # Extra things to remove due to the addition of 'blq_content' in keep_only_tags. + + #
Used on sports pages for 'email' and 'print'. + story_actions_reg_exp = '^.*story[_ -]*actions.*$' + + #
Used on sports pages instead of 'share-help' (for + # social networking links). + bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$' + + #
+ # NOTE: Don't remove class="content-group" that is needed. + # Used on sports pages to link to 'similar stories'. + secondary_content_reg_exp = '^.*secondary[_ -]*content.*$' + + #