Fetching an SSLv3-only site with urllib2 raises httplib.BadStatusLine

I'm trying to read a site into BeautifulSoup, and so far every attempt has failed at the point where I try to open the secure connection (I originally approached this with Python 3, but as you can see below that is also fraught with peril). Here is my latest attempt using urllib2 (I have not found a urllib3 example, nor had much success porting this code to urllib3):

import httplib, ssl, urllib2, socket
from bs4 import BeautifulSoup

class HTTPSConnectionV3(httplib.HTTPSConnection):
    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            self.sock = sock
            self._tunnel()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
        except ssl.SSLError, e:
            print("Trying SSLv3.")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

class HTTPSHandlerV3(urllib2.HTTPSHandler):
    def https_open(self, req):
        return self.do_open(HTTPSConnectionV3, req)

# install opener
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))

r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')

s = r.read()

soup = BeautifulSoup(s)
for t in soup.findAll('h2'):
    print(t)
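
For reference, a urllib3 version of the same idea would presumably pin ssl_version on the pool itself. Here is a rough sketch of what I mean (untested; it assumes the local OpenSSL build still exposes ssl.PROTOCOL_SSLv3):

import ssl
import urllib3

# Sketch: force SSLv3 for every HTTPS connection created by this pool.
# ssl.PROTOCOL_SSLv3 is only available if OpenSSL was built with SSLv3 enabled.
http = urllib3.PoolManager(ssl_version=ssl.PROTOCOL_SSLv3)
r = http.request('GET', 'https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
print(r.status)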

When I run this code, I get the following stack trace:

Traceback (most recent call last):
  File "test.py", line 27, in <module>
    r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
    response = self._open(req, data)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
    '_open', req)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
    result = func(*args)
  File "test.py", line 22, in https_open
    return self.do_open(HTTPSConnectionV3, req)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1187, in do_open
    r = h.getresponse(buffering=True)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1045, in getresponse
    response.begin()
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in begin
    version, status, reason = self._read_status()
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 373, in _read_status
    raise BadStatusLine(line)
httplib.BadStatusLine: ''

To complicate things, here is what I see when I curl the URL:

$ curl -v https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ff1f1804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ff1f1804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
*   Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* Server aborted the SSL handshake
* Closing connection 0
curl: (35) Server aborted the SSL handshake

If I force SSLv3, I get the expected result:

$ curl -v -3 https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ffa50804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ffa50804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
*   Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* SSL 3.0 connection using SSL_RSA_WITH_RC4_128_SHA
* Server certificate: bw6.clpccd.cc.ca.us
* Server certificate: VeriSign Class 3 Secure Server CA - G3
* Server certificate: VeriSign Class 3 Public Primary Certification Authority - G5
* Server certificate: Class 3 Public Primary Certification Authority
> GET /clpccd/2014/02/sched_l.htm HTTP/1.1
> User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
> Host: bw6.clpccd.cc.ca.us
> Accept: */*
> Referer: 
> 
< HTTP/1.1 200 OK
< Date: Thu, 23 Oct 2014 00:00:11 GMT
* Server Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server is not blacklisted
< Server: Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server
< Last-Modified: Wed, 22 Oct 2014 20:05:42 GMT
< ETag: "422e-1e72-54480e16"
< Accept-Ranges: bytes
< Content-Length: 7794
< Connection: close
< Content-Type: text/html
< 
<html....>

* Closing connection 0

In case my earlier attempt helps anyone here, this is what my approach looked like when I was using the requests library in Python 3 (following the "Specific SSL Version" example in their documentation):

import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

from bs4 import BeautifulSoup

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}

class Ssl3HttpAdapter(HTTPAdapter):
    """"Transport adapter" that allows us to use SSLv3."""
    def init_poolmanager(self, connections, maxsize, block=True):
        self.poolmanager = PoolManager(num_pools=connections,
                                        maxsize=maxsize,
                                        block=block,
                                        ssl_version=ssl.PROTOCOL_SSLv3)

s = requests.session()
s.mount('https://bw6.clpccd.cc.ca.us', Ssl3HttpAdapter())

r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)

soup = BeautifulSoup(r.text)
for t in soup.findAll('h2'):
    print(t)

Which produced a similar (but more cryptic) stack trace:

Traceback (most recent call last):
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 331, in _make_request
    httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
    body=body, headers=headers)
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
    response.begin()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
    version, status, reason = self._read_status()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
    raise BadStatusLine(line)
http.client.BadStatusLine: ''

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 362, in send
    timeout=timeout
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 559, in urlopen
    _pool=self, _stacktrace=stacktrace)
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/util/retry.py", line 245, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/packages/six.py", line 309, in reraise
    raise value.with_traceback(tb)
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
    body=body, headers=headers)
  File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
    response.begin()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
    version, status, reason = self._read_status()
  File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
    raise BadStatusLine(line)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', BadStatusLine("''",))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "./test.py", line 23, in <module>
    r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
  File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 469, in get
    return self.request('GET', url, **kwargs)
  File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 457, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 569, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 407, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))

1 Answer

To make some progress, I wrote the code below, although I'm not satisfied that it is a Pythonic answer:

import os
import subprocess

from bs4 import BeautifulSoup
FNULL = open(os.devnull, 'w')

html = subprocess.Popen(["curl", "-3", "https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm"],
                        stdout=subprocess.PIPE, stderr=FNULL).communicate()[0]

soup = BeautifulSoup(html)

for t in soup.findAll('h2'):
    print(t.text)

Basically I'm just calling curl -3 <url> and capturing the output.
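
If shelling out to curl is acceptable, a slightly tidier variant would be subprocess.check_output, which raises an error instead of silently discarding failures. A sketch, assuming the same curl -3 invocation keeps working against this server:

import subprocess

from bs4 import BeautifulSoup

# Sketch: -sS hides the progress meter but still reports errors on stderr;
# check_output raises CalledProcessError if curl exits non-zero.
html = subprocess.check_output(
    ["curl", "-sS", "-3", "https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm"])

soup = BeautifulSoup(html)
for t in soup.findAll('h2'):
    print(t.text)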
