py3: port use of urllib in recipes

This commit is contained in:
Kovid Goyal 2019-04-01 13:57:21 +05:30
parent 930376c036
commit 569c576293
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
36 changed files with 178 additions and 119 deletions

View File

@ -11,7 +11,10 @@ Change Log:
2013/03/30 -- first version 2013/03/30 -- first version
''' '''
import urllib try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -59,7 +62,7 @@ class AM730(BasicNewsRecipe):
continue # not in same section continue # not in same section
title = href.split('/')[-1].split('-')[0] title = href.split('/')[-1].split('-')[0]
title = urllib.unquote(title.encode('ASCII')) # .decode('utf-8') title = unquote(title.encode('ASCII')) # .decode('utf-8')
if self.debug: if self.debug:
print(title) print(title)
try: try:

View File

@ -9,7 +9,10 @@ http://www.ambito.com/diario/
''' '''
import time import time
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
import re import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -66,7 +69,7 @@ class Ambito_Financiero(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX) br.open(self.INDEX)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
postdata = urllib.urlencode({ postdata = urlencode({
'txtUser': self.username, 'txtUser': self.username,
'txtPassword': self.password 'txtPassword': self.password
}) })
@ -110,7 +113,7 @@ class Ambito_Financiero(BasicNewsRecipe):
if self.session_id: if self.session_id:
l, s, r = url.rpartition('.html') l, s, r = url.rpartition('.html')
o, s1, artid = l.rpartition('_') o, s1, artid = l.rpartition('_')
postdata = urllib.urlencode({'id': artid, 'id_session': self.session_id}) postdata = urlencode({'id': artid, 'id_session': self.session_id})
response = self.browser.open( response = self.browser.open(
'http://data.ambito.com/diario/cuerpo_noticia.asp', 'http://data.ambito.com/diario/cuerpo_noticia.asp',
data=postdata, data=postdata,
@ -128,7 +131,7 @@ class Ambito_Financiero(BasicNewsRecipe):
def cleanup(self): def cleanup(self):
if self.session_id is not None: if self.session_id is not None:
postdata = urllib.urlencode({'session_id': self.session_id}) postdata = urlencode({'session_id': self.session_id})
self.browser.open( self.browser.open(
'http://www.ambito.com/diario/no-cache/login/x_logout.asp', data=postdata, timeout=self.timeout 'http://www.ambito.com/diario/no-cache/login/x_logout.asp', data=postdata, timeout=self.timeout
) )

View File

@ -4,7 +4,10 @@ __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
azstarnet.com azstarnet.com
''' '''
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -31,7 +34,7 @@ class Azstarnet(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://azstarnet.com/') br.open('http://azstarnet.com/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'m': 'login', 'u': self.username, 'p': self.password, 'z': 'http://azstarnet.com/' data = urlencode({'m': 'login', 'u': self.username, 'p': self.password, 'z': 'http://azstarnet.com/'
}) })
br.open('http://azstarnet.com/app/registration/proxy.php', data) br.open('http://azstarnet.com/app/registration/proxy.php', data)
return br return br

View File

@ -7,6 +7,9 @@ from __future__ import (unicode_literals, division, absolute_import,
import json import json
from mechanize import Request from mechanize import Request
try:
from urllib.parse import quote
except ImportError:
from urllib import quote from urllib import quote
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -26,7 +29,7 @@ class Barrons(BasicNewsRecipe):
timefmt = ' [%a, %b %d, %Y]' timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*'] match_regexps = ['http://online.barrons.com/.*?html\\?mod=.*?|file:.*']
conversion_options = {'linearize_tables': True} conversion_options = {'linearize_tables': True}
# Don't grab articles more than 7 days old # Don't grab articles more than 7 days old

View File

@ -6,7 +6,10 @@ __copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
www.brecha.com.uy www.brecha.com.uy
''' '''
import urllib try:
from urllib.parse import urlencode, quote
except ImportError:
from urllib import urlencode, quote
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -40,7 +43,7 @@ class Brecha(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://www.brecha.com.uy/index.php/acceder-miembros') br.open('http://www.brecha.com.uy/index.php/acceder-miembros')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'task': 'login', 'view': 'register', 'username': self.username, 'password': self.password data = urlencode({'task': 'login', 'view': 'register', 'username': self.username, 'password': self.password
}) })
br.open( br.open(
'http://www.brecha.com.uy/index.php/index.php?option=com_osemsc&controller=register', data) 'http://www.brecha.com.uy/index.php/index.php?option=com_osemsc&controller=register', data)
@ -74,5 +77,5 @@ class Brecha(BasicNewsRecipe):
soup = self.index_to_soup('http://www.brecha.com.uy/index.php') soup = self.index_to_soup('http://www.brecha.com.uy/index.php')
for image in soup.findAll('img', alt=True): for image in soup.findAll('img', alt=True):
if image['alt'].startswith('Tapa '): if image['alt'].startswith('Tapa '):
return 'http://www.brecha.com.uy' + urllib.quote(image['src']) return 'http://www.brecha.com.uy' + quote(image['src'])
return None return None

View File

@ -9,7 +9,10 @@ __copyright__ = '2008-2016, Darko Miletic <darko.miletic at gmail.com>'
clarin.com clarin.com
''' '''
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -76,7 +79,7 @@ class Clarin(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX) br.open(self.INDEX)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'ingresar_ingresar_paseForm': 'ingresar_ingresar_paseForm', data = urlencode({'ingresar_ingresar_paseForm': 'ingresar_ingresar_paseForm',
'ingresar_ingresar_email_paseInputComponent': self.username, 'ingresar_ingresar_email_paseInputComponent': self.username,
'ingresar_ingresar_palabraClave_paseInputComponent': self.password, 'ingresar_ingresar_palabraClave_paseInputComponent': self.password,
'ingresar_ingresar_ingresar_paseButton': 'Ingresar', 'ingresar_ingresar_ingresar_paseButton': 'Ingresar',

View File

@ -1,4 +1,7 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
try:
from urllib.parse import quote
except ImportError:
from urllib import quote from urllib import quote

View File

@ -1,9 +1,8 @@
from __future__ import print_function from __future__ import print_function
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag, BeautifulSoup from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.magick import Image, PixelWand from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError
def new_tag(soup, name, attrs=()): def new_tag(soup, name, attrs=()):
@ -113,20 +112,14 @@ class Estadao(BasicNewsRecipe):
def get_cover_url(self): def get_cover_url(self):
if self.THUMBALIZR_API: if self.THUMBALIZR_API:
cover_url = self.CAPA cover_url = self.CAPA
pedido = Request(self.CAPA)
pedido.add_header('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; ' +
self.LANGHTM + '; userid=' + self.THUMBALIZR_API + ') Calibre/0.8.47 (like Gecko)')
pedido.add_header('Accept-Charset', self.ENCHTM)
pedido.add_header('Referer', self.SCREENSHOT)
try: try:
resposta = urlopen(pedido) soup = self.index_to_soup(cover_url)
soup = BeautifulSoup(resposta)
cover_item = soup.find('body') cover_item = soup.find('body')
if cover_item: if cover_item:
cover_url = 'http://api.thumbalizr.com/?api_key=' + self.THUMBALIZR_API + \ cover_url = 'http://api.thumbalizr.com/?api_key=' + self.THUMBALIZR_API + \
'&url=' + self.SCREENSHOT + '&width=600&quality=90' '&url=' + self.SCREENSHOT + '&width=600&quality=90'
return cover_url return cover_url
except URLError: except Exception:
cover_url = 'http://api.thumbalizr.com/?api_key=' + self.THUMBALIZR_API + \ cover_url = 'http://api.thumbalizr.com/?api_key=' + self.THUMBALIZR_API + \
'&url=' + self.SCREENSHOT + '&width=600&quality=90' '&url=' + self.SCREENSHOT + '&width=600&quality=90'
return cover_url return cover_url

View File

@ -5,8 +5,11 @@ __copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.folha.uol.com.br www.folha.uol.com.br
''' '''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
try:
from urllib.parse import quote_plus
except ImportError:
from urllib import quote_plus
class Folha_de_s_paulo(BasicNewsRecipe): class Folha_de_s_paulo(BasicNewsRecipe):
@ -69,7 +72,7 @@ class Folha_de_s_paulo(BasicNewsRecipe):
return curl return curl
def print_version(self, url): def print_version(self, url):
return 'http://tools.folha.com.br/print?site=emcimadahora&url=' + urllib.quote_plus(url) return 'http://tools.folha.com.br/print?site=emcimadahora&url=' + quote_plus(url)
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup('http://www.folha.uol.com.br/') soup = self.index_to_soup('http://www.folha.uol.com.br/')

View File

@ -4,8 +4,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import re import re
import shutil import shutil
import urllib
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -46,7 +49,7 @@ class AdvancedUserRecipe1515196393(BasicNewsRecipe):
self.log('\t\tdata-parent-id', parent_id) self.log('\t\tdata-parent-id', parent_id)
self.log('\t\tdata-cat-id', cat_id) self.log('\t\tdata-cat-id', cat_id)
self.log('\t\tdata-post-id', post_id) self.log('\t\tdata-post-id', post_id)
data = urllib.urlencode({'action':'get_content', 'cat_id':cat_id, 'parent_id':parent_id, 'post_id':post_id}) data = urlencode({'action':'get_content', 'cat_id':cat_id, 'parent_id':parent_id, 'post_id':post_id})
r=br.open('http://www.galaxysedge.com/wp-content/themes/galaxyedge/get_content.php', data) r=br.open('http://www.galaxysedge.com/wp-content/themes/galaxyedge/get_content.php', data)
content_file = PersistentTemporaryFile(suffix='.html', dir=self.ctdir) content_file = PersistentTemporaryFile(suffix='.html', dir=self.ctdir)
content_file.write(r.read()) content_file.write(r.read())

View File

@ -4,8 +4,11 @@ __copyright__ = '2010-2015, Darko Miletic <darko.miletic at gmail.com>'
www.haaretz.com www.haaretz.com
''' '''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
class Haaretz_en(BasicNewsRecipe): class Haaretz_en(BasicNewsRecipe):
@ -62,7 +65,7 @@ class Haaretz_en(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.PREFIX) br.open(self.PREFIX)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'cb': 'parseEngReply', 'newsso': 'true', 'fromlogin': 'true', 'layer': 'eng_login', 'userName': self.username, 'password': self.password # noqa data = urlencode({'cb': 'parseEngReply', 'newsso': 'true', 'fromlogin': 'true', 'layer': 'eng_login', 'userName': self.username, 'password': self.password # noqa
}) })
br.open('https://sso.haaretz.com/sso/sso/signIn', data) br.open('https://sso.haaretz.com/sso/sso/signIn', data)
return br return br

View File

@ -15,7 +15,10 @@ anything in username/password fields
import time import time
import re import re
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -61,7 +64,7 @@ class Harpers_full(BasicNewsRecipe):
br.open('https://harpers.org/') br.open('https://harpers.org/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
tt = time.localtime() * 1000 tt = time.localtime() * 1000
data = urllib.urlencode({'action': 'cds_auth_user', 'm': self.username, 'p': self.password, 'rt': 'https://harpers.org/', 'tt': tt data = urlencode({'action': 'cds_auth_user', 'm': self.username, 'p': self.password, 'rt': 'https://harpers.org/', 'tt': tt
}) })
br.open(self.LOGIN, data) br.open(self.LOGIN, data)
return br return br

View File

@ -5,8 +5,11 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
from mechanize import Request from mechanize import Request
from urllib import urlencode
import json import json
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
class HBR(BasicNewsRecipe): class HBR(BasicNewsRecipe):

View File

@ -8,8 +8,6 @@ chron.com
''' '''
import re import re
import time import time
import urllib2
import io
from datetime import datetime from datetime import datetime
import traceback import traceback
import sys import sys
@ -19,8 +17,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz from calibre.utils.date import dt_factory, local_tz
from lxml import html
from lxml import etree
regex_date_only = re.compile(r"""(?:January|February|March|April| regex_date_only = re.compile(r"""(?:January|February|March|April|
{8}May|June|July|August|September|October|November| {8}May|June|July|August|September|October|November|
@ -62,12 +58,8 @@ def validate_link(page, link, title):
return link, title return link, title
def get_article_parsed(this_url): def get_article_parsed(index_to_soup, this_url):
page = urllib2.urlopen(this_url) return index_to_soup(this_url, as_tree=True)
content = page.read()
parser = etree.HTMLParser()
parsed = html.parse(io.BytesIO(bytes(content)), parser)
return parsed
def sort_subject(element_list): def sort_subject(element_list):
@ -91,8 +83,8 @@ def sort_subject(element_list):
return combined_list return combined_list
def get_links_from_section_page(page): def get_links_from_section_page(index_to_soup, page):
page_doc = get_article_parsed(base_url + page[1][0]) page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
els = page_doc.xpath(xpath_general) els = page_doc.xpath(xpath_general)
element_list = [] element_list = []
for el in els: for el in els:
@ -110,13 +102,13 @@ def get_links_from_section_page(page):
return [page[0], sorted_element_list] return [page[0], sorted_element_list]
def get_all_links_from_sections(): def get_all_links_from_sections(index_to_soup):
all_sections = [] all_sections = []
article_set = set() article_set = set()
final_dict = OrderedDict() final_dict = OrderedDict()
for item in pages.items(): for item in pages.items():
print("getting links from {0}".format(item[0])) print("getting links from {0}".format(item[0]))
all_sections.append(get_links_from_section_page(item)) all_sections.append(get_links_from_section_page(index_to_soup, item))
for section in all_sections: for section in all_sections:
section_id = section[0] section_id = section[0]
article_list = section[1] article_list = section[1]
@ -232,7 +224,7 @@ class HoustonChronicle(BasicNewsRecipe):
self.timefmt = ' [%a, %d %b, %Y]' self.timefmt = ' [%a, %d %b, %Y]'
self.log('starting parse_index: ', time.strftime(self.timestampfmt)) self.log('starting parse_index: ', time.strftime(self.timestampfmt))
feeds = [] feeds = []
sections = get_all_links_from_sections() sections = get_all_links_from_sections(self.index_to_soup)
for section_id, article_list in sections.items(): for section_id, article_list in sections.items():
self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list))) self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
articles = [] articles = []

View File

@ -7,6 +7,9 @@ import urlparse, re
import json import json
from uuid import uuid4 from uuid import uuid4
from mechanize import Request from mechanize import Request
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode from urllib import urlencode
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe

View File

@ -1,4 +1,7 @@
import urllib2 try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -39,7 +42,7 @@ class JBPress(BasicNewsRecipe):
return br return br
def print_version(self, url): def print_version(self, url):
url = urllib2.urlopen(url).geturl() # resolve redirect. url = urlopen(url).geturl() # resolve redirect.
return url.replace('/-/', '/print/') return url.replace('/-/', '/print/')
def preprocess_html(self, soup): def preprocess_html(self, soup):

View File

@ -5,6 +5,9 @@ www.jornada.unam.mx
''' '''
import re import re
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs from urlparse import urlparse, urlunparse, parse_qs
from calibre import strftime from calibre import strftime

View File

@ -5,7 +5,10 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
lanacion.cl lanacion.cl
''' '''
import urllib try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -36,7 +39,7 @@ class LaNacionChile(BasicNewsRecipe):
feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')] feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')]
def print_version(self, url): def print_version(self, url):
toprint = urllib.quote(url, ':/') toprint = quote(url, ':/')
return u'http://www.lanacion.cl/cgi-bx/imprimir.cgi?_URL=' + toprint return u'http://www.lanacion.cl/cgi-bx/imprimir.cgi?_URL=' + toprint
def preprocess_html(self, soup): def preprocess_html(self, soup):

View File

@ -7,7 +7,6 @@ Lemonde.fr: Version abonnée
''' '''
import os, zipfile, re, time import os, zipfile, re, time
from urllib2 import HTTPError
from calibre.constants import preferred_encoding from calibre.constants import preferred_encoding
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -97,7 +96,7 @@ class LeMondeAbonne(BasicNewsRecipe):
try: try:
response = browser.open(url) response = browser.open(url)
continue continue
except HTTPError: except Exception:
second -= 24 * 60 * 60 second -= 24 * 60 * 60
tmp = PersistentTemporaryFile(suffix='.zip') tmp = PersistentTemporaryFile(suffix='.zip')

View File

@ -4,7 +4,10 @@ __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
mondediplo.com mondediplo.com
''' '''
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -43,7 +46,7 @@ class LeMondeDiplomatiqueEn(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.LOGIN) br.open(self.LOGIN)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'login': self.username, 'pass': self.password, 'enter': 'enter' data = urlencode({'login': self.username, 'pass': self.password, 'enter': 'enter'
}) })
br.open(self.LOGIN, data) br.open(self.LOGIN, data)
return br return br

View File

@ -3,7 +3,10 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE from calibre.constants import config_dir, CONFIG_DIR_MODE
import os import os
import os.path import os.path
import urllib try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from hashlib import md5 from hashlib import md5
@ -62,7 +65,7 @@ class ModorosBlogHu(BasicNewsRecipe):
feeds = BasicNewsRecipe.parse_feeds(self) feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds: for feed in feeds:
feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='') feed_hash = quote(feed.title.encode('utf-8'), safe='')
feed_fn = os.path.join(feed_dir, feed_hash) feed_fn = os.path.join(feed_dir, feed_hash)
past_items = set() past_items = set()

View File

@ -6,7 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Starson17' __copyright__ = '2010, Starson17'
import os import os
import urllib2
import zipfile import zipfile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
@ -25,10 +24,10 @@ class NowToronto(BasicNewsRecipe):
epub_feed = "http://feeds.feedburner.com/NowEpubEditions" epub_feed = "http://feeds.feedburner.com/NowEpubEditions"
soup = self.index_to_soup(epub_feed) soup = self.index_to_soup(epub_feed)
url = soup.find(name='feedburner:origlink').string url = soup.find(name='feedburner:origlink').string
f = urllib2.urlopen(url) raw = self.index_to_soup(url, raw=True)
tmp = PersistentTemporaryFile(suffix='.epub') tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0, _('downloading epub')) self.report_progress(0, _('downloading epub'))
tmp.write(f.read()) tmp.write(raw)
tmp.close() tmp.close()
zfile = zipfile.ZipFile(tmp.name, 'r') zfile = zipfile.ZipFile(tmp.name, 'r')
self.report_progress(0, _('extracting epub')) self.report_progress(0, _('extracting epub'))

View File

@ -4,7 +4,11 @@ __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
www.nursingtimes.net www.nursingtimes.net
''' '''
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -31,7 +35,7 @@ class NursingTimes(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.LOGIN) br.open(self.LOGIN)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'campaigncode': '0', 'referrer': '', 'security_text': '', 'SIemail': self.username, 'passWord': self.password, 'LoginButton.x': '27', 'LoginButton.y': '13' # noqa data = urlencode({'campaigncode': '0', 'referrer': '', 'security_text': '', 'SIemail': self.username, 'passWord': self.password, 'LoginButton.x': '27', 'LoginButton.y': '13' # noqa
}) })
br.open(self.LOGIN, data) br.open(self.LOGIN, data)
return br return br

View File

@ -3,7 +3,6 @@
from __future__ import unicode_literals, division, absolute_import, print_function from __future__ import unicode_literals, division, absolute_import, print_function
import time import time
import json import json
import urllib
from pprint import pprint from pprint import pprint
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -87,9 +86,7 @@ class OrangeCountyRegister(BasicNewsRecipe):
return cleanedHTML return cleanedHTML
def loadURL(self, url): def loadURL(self, url):
socket = urllib.urlopen(url) return self.index_to_soup(url, raw=True)
rawHTML = socket.read()
return rawHTML
def htmlToAttribsDict(self, rawHTML): def htmlToAttribsDict(self, rawHTML):
tokenStart = 'dataLayer.push({' tokenStart = 'dataLayer.push({'

View File

@ -3,7 +3,10 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE from calibre.constants import config_dir, CONFIG_DIR_MODE
import os import os
import os.path import os.path
import urllib try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from hashlib import md5 from hashlib import md5
@ -86,7 +89,7 @@ class OfficeSpaceBlogHu(BasicNewsRecipe):
feeds = BasicNewsRecipe.parse_feeds(self) feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds: for feed in feeds:
feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='') feed_hash = quote(feed.title.encode('utf-8'), safe='')
feed_fn = os.path.join(feed_dir, feed_hash) feed_fn = os.path.join(feed_dir, feed_hash)
past_items = set() past_items = set()

View File

@ -1,5 +1,8 @@
import urllib
import re import re
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -37,7 +40,7 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
ans = None ans = None
try: try:
s = article.summary s = article.summary
ans = urllib.unquote( ans = unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except: except:
pass pass

View File

@ -8,8 +8,14 @@ import json
import operator import operator
import re import re
import tempfile import tempfile
import urllib try:
import urllib2 from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.error import HTTPError, URLError
except ImportError:
from urllib2 import HTTPError, URLError
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -99,18 +105,8 @@ class Pocket(BasicNewsRecipe):
self.get_auth_uri(), self.get_auth_uri(),
self.get_pull_articles_uri() self.get_pull_articles_uri()
) )
try: data = self.index_to_soup(fetch_url, raw=True)
request = urllib2.Request(fetch_url) pocket_feed = json.loads(data)['list']
response = urllib2.urlopen(request)
pocket_feed = json.load(response)['list']
except urllib2.HTTPError as e:
self.log.exception(
"Pocket returned an error: {0}".format(e.info()))
return []
except urllib2.URLError as e:
self.log.exception(
"Unable to connect to getpocket.com's api: {0}\nurl: {1}".format(e, fetch_url))
return []
if len(pocket_feed) < self.minimum_articles: if len(pocket_feed) < self.minimum_articles:
self.mark_as_read_after_dl = False self.mark_as_read_after_dl = False
@ -143,10 +139,10 @@ class Pocket(BasicNewsRecipe):
fc_tag = soup.find('script', text=re.compile("formCheck")) fc_tag = soup.find('script', text=re.compile("formCheck"))
fc_id = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag).group(1) fc_id = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag).group(1)
article_id = url.split("/")[-1] article_id = url.split("/")[-1]
data = urllib.urlencode({'itemId': article_id, 'formCheck': fc_id}) data = urlencode({'itemId': article_id, 'formCheck': fc_id})
try: try:
response = self.browser.open(ajax_url, data) response = self.browser.open(ajax_url, data)
except urllib2.HTTPError as e: except HTTPError as e:
self.log.exception("unable to get textview {0}".format(e.info())) self.log.exception("unable to get textview {0}".format(e.info()))
raise e raise e
return json.load(response)['article'] return json.load(response)['article']
@ -186,13 +182,12 @@ class Pocket(BasicNewsRecipe):
self.get_auth_uri() self.get_auth_uri()
) )
try: try:
request = urllib2.Request(mark_read_url) self.browser.open_novisit(mark_read_url)
urllib2.urlopen(request) except HTTPError as e:
except urllib2.HTTPError as e:
self.log.exception( self.log.exception(
'Pocket returned an error while archiving articles: {0}'.format(e)) 'Pocket returned an error while archiving articles: {0}'.format(e))
return [] return []
except urllib2.URLError as e: except URLError as e:
self.log.exception( self.log.exception(
"Unable to connect to getpocket.com's modify api: {0}".format(e)) "Unable to connect to getpocket.com's modify api: {0}".format(e))
return [] return []

View File

@ -1,5 +1,4 @@
import re import re
import urllib2
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -54,7 +53,7 @@ class Ebert(BasicNewsRecipe):
self.report_progress(0, _('Fetching feed') + ' %s...' % self.report_progress(0, _('Fetching feed') + ' %s...' %
(feedtitle if feedtitle else feedurl)) (feedtitle if feedtitle else feedurl))
articles = [] articles = []
page = urllib2.urlopen(feedurl).read() page = self.index_to_soup(feedurl, raw=True)
if feedtitle == 'Reviews' or feedtitle == 'Great Movies': if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
pattern = self.patternReviews pattern = self.patternReviews

View File

@ -1,5 +1,4 @@
import re import re
import urllib2
import time import time
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime from calibre import strftime
@ -68,7 +67,7 @@ class Ebert(BasicNewsRecipe):
self.report_progress(0, _('Fetching feed') + ' %s...' % self.report_progress(0, _('Fetching feed') + ' %s...' %
(feedtitle if feedtitle else feedurl)) (feedtitle if feedtitle else feedurl))
articles = [] articles = []
page = urllib2.urlopen(feedurl).read() page = self.index_to_soup(feedurl, raw=True)
if feedtitle == 'Reviews' or feedtitle == 'Great Movies': if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
pattern = self.patternReviews pattern = self.patternReviews

View File

@ -3,8 +3,11 @@ __copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.thetimes.co.uk/magazine/the-sunday-times-magazine/ www.thetimes.co.uk/magazine/the-sunday-times-magazine/
''' '''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
def classes(classes): def classes(classes):
@ -49,7 +52,7 @@ class TimesOnline(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://www.thetimes.co.uk/') br.open('http://www.thetimes.co.uk/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({ data = urlencode({
'gotoUrl': self.INDEX, 'gotoUrl': self.INDEX,
'username': self.username, 'username': self.username,
'password': self.password}) 'password': self.password})

View File

@ -9,10 +9,14 @@ __docformat__ = 'restructuredtext de'
www.taz.de/digiabo www.taz.de/digiabo
''' '''
import os import os
import urllib2
import zipfile import zipfile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
try:
from urllib.request import HTTPBasicAuthHandler, build_opener, install_opener, urlopen
from urllib.error import HTTPError
except ImportError:
from urllib2 import HTTPBasicAuthHandler, build_opener, install_opener, urlopen, HTTPError
class TazDigiabo(BasicNewsRecipe): class TazDigiabo(BasicNewsRecipe):
@ -34,17 +38,17 @@ class TazDigiabo(BasicNewsRecipe):
url = domain + "/epub/" url = domain + "/epub/"
auth_handler = urllib2.HTTPBasicAuthHandler() auth_handler = HTTPBasicAuthHandler()
auth_handler.add_password(realm='TAZ-ABO', auth_handler.add_password(realm='TAZ-ABO',
uri=url, uri=url,
user=self.username, user=self.username,
passwd=self.password) passwd=self.password)
opener = urllib2.build_opener(auth_handler) opener = build_opener(auth_handler)
urllib2.install_opener(opener) install_opener(opener)
try: try:
f = urllib2.urlopen(url) f = urlopen(url)
except urllib2.HTTPError: except HTTPError:
self.report_progress(0, _('Can\'t login to download issue')) self.report_progress(0, _('Can\'t login to download issue'))
raise ValueError('Failed to login, check your username and' raise ValueError('Failed to login, check your username and'
' password') ' password')

View File

@ -9,9 +9,12 @@ __copyright__ = '2019, Darko Miletic <darko.miletic at gmail.com>'
www.newcriterion.com www.newcriterion.com
''' '''
import urllib try:
import urllib2 from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
import re import re
from mechanize import Request
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
@ -52,7 +55,7 @@ class TheNewCriterion(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('https://www.newcriterion.com/') br.open('https://www.newcriterion.com/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'login': self.username, 'password': self.password}) data = urlencode({'login': self.username, 'password': self.password})
header = { header = {
'X-OCTOBER-REQUEST-HANDLER': 'onSignin', 'X-OCTOBER-REQUEST-HANDLER': 'onSignin',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
@ -60,7 +63,7 @@ class TheNewCriterion(BasicNewsRecipe):
'X-OCTOBER-REQUEST-PARTIALS':'', 'X-OCTOBER-REQUEST-PARTIALS':'',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8' 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
} }
request = urllib2.Request('https://www.newcriterion.com/', data, header) request = Request('https://www.newcriterion.com/', data, header)
br.open(request) br.open(request)
return br return br

View File

@ -3,8 +3,11 @@ __copyright__ = '2010-2017, Bobby Steel <bob at xdca.com>, Darko Miletic'
''' '''
www.thetimes.co.uk www.thetimes.co.uk
''' '''
import urllib
import html5lib import html5lib
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from lxml import html from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -79,7 +82,7 @@ class TimesOnline(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://www.thetimes.co.uk/') br.open('http://www.thetimes.co.uk/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({ data = urlencode({
'gotoUrl': self.INDEX, 'gotoUrl': self.INDEX,
'username': self.username, 'username': self.username,
'password': self.password}) 'password': self.password})

View File

@ -4,7 +4,10 @@ __copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
tomshardware.com/us tomshardware.com/us
''' '''
import urllib try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -30,7 +33,7 @@ class Tomshardware(BasicNewsRecipe):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX + '/us/') br.open(self.INDEX + '/us/')
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
data = urllib.urlencode({'action': 'login_action', 'r': self.INDEX + '/us/', 'login': self.username, 'mdp': self.password data = urlencode({'action': 'login_action', 'r': self.INDEX + '/us/', 'login': self.username, 'mdp': self.password
}) })
br.open(self.LOGIN, data) br.open(self.LOGIN, data)
return br return br

View File

@ -5,8 +5,11 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
from urllib import quote
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from mechanize import Request from mechanize import Request
from calibre import random_user_agent from calibre import random_user_agent

View File

@ -5,6 +5,9 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
try:
from urllib.parse import quote
except ImportError:
from urllib import quote from urllib import quote
from mechanize import Request from mechanize import Request