Попытка получить сайт SSLv3 с помощью urllib2 вызывает исключение httplib.BadStatusLine
Я пытаюсь загрузить сайт в BeautifulSoup, но до сих пор каждая попытка проваливалась в момент открытия защищённого соединения (изначально я пробовал сделать это на Python 3, но, как видно ниже, и там хватает проблем). Вот моя последняя попытка с использованием urllib2 (я не нашёл примера для urllib3 и не добился особого успеха, пытаясь перенести этот код на urllib3):
import httplib, ssl, urllib2, socket
from bs4 import BeautifulSoup
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPS connection that negotiates SSLv3 first and falls back to SSLv23.

    Intended to be passed to ``urllib2.HTTPSHandler.do_open`` in place of
    the stock ``httplib.HTTPSConnection``.
    """

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def _open_socket(self):
        """Open a new TCP connection, running the proxy tunnel if one is set.

        Returns the connected, not yet TLS-wrapped, socket.
        """
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            # _tunnel() talks over self.sock, so it must be assigned first.
            self.sock = sock
            self._tunnel()
        return sock

    def connect(self):
        """Connect to the host and wrap the socket in SSL.

        SSLv3 is attempted first. If that handshake raises ``ssl.SSLError``
        the socket is discarded and SSLv23 is attempted on a new connection.
        Any error from the second attempt propagates to the caller.
        """
        sock = self._open_socket()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv3)
        except ssl.SSLError:
            print("SSLv3 handshake failed; trying SSLv23.")
            # Handshake bytes were already exchanged on this socket, so it
            # cannot carry a second handshake; start from a new connection.
            sock.close()
            sock = self._open_socket()
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that opens requests via HTTPSConnectionV3."""

    def https_open(self, req):
        # Hand the custom connection class to do_open and return its
        # result unchanged.
        connection_class = HTTPSConnectionV3
        return self.do_open(connection_class, req)
# Install the custom HTTPS handler as the default opener used by urlopen().
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
try:
    s = r.read()
finally:
    # Close the response explicitly once the body has been read.
    r.close()
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(s, 'html.parser')
for t in soup.find_all('h2'):
    print(t)
Когда я запускаю этот код, я получаю следующую трассировку стека:
Traceback (most recent call last):
File "test.py", line 27, in <module>
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "test.py", line 22, in https_open
return self.do_open(HTTPSConnectionV3, req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 373, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
Ситуацию усложняет то, что я вижу, когда запрашиваю этот URL через curl:
$ curl -v https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ff1f1804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ff1f1804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* Server aborted the SSL handshake
* Closing connection 0
curl: (35) Server aborted the SSL handshake
Если я использую SSLv3, я получаю ожидаемый результат:
$ curl -v -3 https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ffa50804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ffa50804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* SSL 3.0 connection using SSL_RSA_WITH_RC4_128_SHA
* Server certificate: bw6.clpccd.cc.ca.us
* Server certificate: VeriSign Class 3 Secure Server CA - G3
* Server certificate: VeriSign Class 3 Public Primary Certification Authority - G5
* Server certificate: Class 3 Public Primary Certification Authority
> GET /clpccd/2014/02/sched_l.htm HTTP/1.1
> User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
> Host: bw6.clpccd.cc.ca.us
> Accept: */*
> Referer:
>
< HTTP/1.1 200 OK
< Date: Thu, 23 Oct 2014 00:00:11 GMT
* Server Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server is not blacklisted
< Server: Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server
< Last-Modified: Wed, 22 Oct 2014 20:05:42 GMT
< ETag: "422e-1e72-54480e16"
< Accept-Ranges: bytes
< Content-Length: 7794
< Connection: close
< Content-Type: text/html
<
<html....>
* Closing connection 0
На случай, если моя предыдущая попытка кому-то здесь поможет, вот как выглядел мой подход, когда я использовал библиотеку requests в Python 3 (следуя примеру из её документации: «Example: Specific SSL Version»):
import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from bs4 import BeautifulSoup
# Request headers carrying a Firefox-style User-agent string; passed to s.get() below.
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
class Ssl3HttpAdapter(HTTPAdapter):
    """Transport adapter whose pool manager is created with SSLv3 selected."""

    def init_poolmanager(self, connections, maxsize, block=True):
        # Collect the pool settings first, then build the manager in one call.
        pool_settings = {
            'num_pools': connections,
            'maxsize': maxsize,
            'block': block,
            'ssl_version': ssl.PROTOCOL_SSLv3,
        }
        self.poolmanager = PoolManager(**pool_settings)
# Route every request to this host through the SSLv3 transport adapter.
s = requests.Session()
s.mount('https://bw6.clpccd.cc.ca.us', Ssl3HttpAdapter())
r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')
for t in soup.find_all('h2'):
    print(t)
Этот код дал похожую (но более загадочную) трассировку стека:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 331, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: ''
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 362, in send
timeout=timeout
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 559, in urlopen
_pool=self, _stacktrace=stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/util/retry.py", line 245, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/packages/six.py", line 309, in reraise
raise value.with_traceback(tb)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', BadStatusLine("''",))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./test.py", line 23, in <module>
r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 469, in get
return self.request('GET', url, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 457, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 569, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 407, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
1 ответ
Чтобы продвинуться дальше, я написал этот код, хотя и не считаю такое решение достаточно «питоничным»:
import os
import subprocess
from bs4 import BeautifulSoup
# Run "curl -3 <url>" and capture its stdout; curl's stderr is discarded.
# The devnull handle is opened in a with-block so it is closed afterwards.
with open(os.devnull, 'w') as FNULL:
    html = subprocess.Popen(
        ["curl", "-3", "https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm"],
        stdout=subprocess.PIPE,
        stderr=FNULL
    ).communicate()[0]
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
for t in soup.find_all('h2'):
    print(t.text)
По сути, я просто вызываю curl -3 <url>
и перехватываю его вывод.