Prevent a 503 error when scraping Google Scholar

I wrote the following code to scrape data from the Google Scholar security tag pages. However, whenever I run it, I get this error:

 Traceback (most recent call last):
  File "/Users/.../Documents/GS_Tag_Scraper/scrape-modified.py", line 53, in <module>
    getProfileFromTag(each)
  File "/Users/.../Documents/GS_Tag_Scraper/scrape-modified.py", line 32, in getProfileFromTag
    page = urllib.request.urlopen(url)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 163, in urlopen
    return opener.open(url, data, timeout)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 472, in open
    response = meth(req, response)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 582, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 504, in error
    result = self._call_chain(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 696, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 472, in open
    response = meth(req, response)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 582, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 510, in error
    return self._call_chain(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 590, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable

I assume this is because GS is blocking my requests. How can I prevent this?

The code is:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import string
import csv
import time

# Lists to store the scraped data
name = []
urlList =[]

# Open the CSV file and write its header row
outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)
outputWriter.writerow(['Name', 'URL', 'Total Citations', 'h-index', 'i10-index'])

def getStat(url):
    # Given an author's URL, return a list of their citation stats
    # (the <td class="gsc_rsb_std"> cells: citations, h-index, i10-index).
    url = 'https://scholar.google.pl' + url
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'lxml')
    cells = soup.findAll("td", {"class": "gsc_rsb_std"})
    stats = [cell.text for cell in cells]
    return stats

def getProfileFromTag(tag):
    url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:" + tag
    while True:
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'lxml')

        mydivs = soup.findAll("h3", {"class": "gsc_1usr_name"})
        for each in mydivs:
            for anchor in each.find_all('a'):
                name.append(anchor.text)
                urlList.append(anchor['href'])
                time.sleep(0.001)
        buttons = soup.findAll("button", {"aria-label": "Następna"})
        if not buttons:
            break
        on_click = buttons[0].get('onclick')
        url = 'http://scholar.google.pl' + on_click[17:-1]
        url = url.encode('utf-8').decode('unicode_escape')
    for i, each in enumerate(name):
        stats = getStat(urlList[i])
        outputWriter.writerow([each, urlList[i], stats[0], stats[2], stats[4]])

tags = ['security']
for each in tags:
    getProfileFromTag(each)

2 Answers

Use requests together with the appropriate request headers.

import requests
from bs4 import BeautifulSoup

url = 'https://scholar.google.pl/citations?view_op=search_authors&mauthors=label:security'

request_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

with requests.Session() as s:
    r = s.get(url, headers=request_headers)

soup = BeautifulSoup(r.text, 'lxml')

As a result, you'll get:

Adrian Perrig    /citations?user=n-Oret4AAAAJ&hl=pl
Vern Paxson      /citations?user=HvwPRJ0AAAAJ&hl=pl
Frans Kaashoek   /citations?user=YCoLskoAAAAJ&hl=pl
Mihir Bellare    /citations?user=2pW1g5IAAAAJ&hl=pl
Xuemin Shen      /citations?user=Bjl3GwoAAAAJ&hl=pl
Helen J. Wang    /citations?user=qhu-DxwAAAAJ&hl=pl
Sushil Jajodia   /citations?user=lOZ1vHIAAAAJ&hl=pl
Martin Abadi     /citations?user=vWTI60AAAAAJ&hl=pl
Jean-Pierre Hubaux   /citations?user=W7YBLlEAAAAJ&hl=pl
Ross Anderson    /citations?user=WgyDcoUAAAAJ&hl=pl

using this:

users = soup.findAll('h3', {'class': 'gsc_oai_name'})
for user in users:
    name = user.a.text.strip()
    link = user.a['href']
    print(name, '\t', link)

You can find the headers a browser sends by examining the Network tab in Chrome's developer tools.
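
If you'd rather stay with urllib.request from the question, the same headers can be attached through a Request object; a minimal sketch (the header values here are only examples copied from a browser, substitute what your own browser sends):

import urllib.request
from bs4 import BeautifulSoup

url = 'https://scholar.google.pl/citations?view_op=search_authors&mauthors=label:security'

# Example header values; copy the real ones from your browser's Network tab.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.8',
}

req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req) as page:
    soup = BeautifulSoup(page, 'lxml')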

I assume you were trying to parse the next-page token; if not, the problem is that you didn't pass the parsed next-page token ID on to the following request, or you've hit an IP rate limit or been served a CAPTCHA. Unfortunately, request headers alone aren't enough.
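
If it is the rate limit, slowing down and retrying on 503 can also help; a minimal sketch using requests (the delay and retry counts are arbitrary example values):

import time
import requests

def get_with_backoff(url, headers=None, retries=4):
    # Retry a GET request with an increasing delay whenever Scholar answers 503.
    delay = 5  # seconds; example starting value
    for attempt in range(retries):
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 503:
            return response
        time.sleep(delay)
        delay *= 2  # back off exponentially
    return response  # still 503 after all retries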

The ideal solution, besides passing the next-page token, is to use proxies together with a CAPTCHA-solving service.
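
Routing the requests through a proxy only takes a proxies mapping; a minimal sketch, assuming a hypothetical proxy address:

import requests

# Hypothetical proxy address; substitute your own proxy or rotating proxy pool.
proxies = {
    "http": "http://user:password@proxy.example.com:8080",
    "https": "http://user:password@proxy.example.com:8080",
}

response = requests.get(
    "https://scholar.google.com/citations?view_op=search_authors&mauthors=label:security",
    proxies=proxies,
    timeout=30,
)
print(response.status_code)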


Code and a full example in the online IDE:

from bs4 import BeautifulSoup
import requests, lxml, re


def scrape_all_authors():
    params = {
        "view_op": "search_authors",  # author results 
        "mauthors": "label:security", # search query
        "hl": "en",                   # language
        "gl": "us",                   # country to search from 
        "astart": 0                   # page number
    }

    authors_is_present = True
    while authors_is_present:
       
        # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582",
        }

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for author in soup.select(".gs_ai_chpr"):
            name = author.select_one(".gs_ai_name a").text
            link = f'https://scholar.google.com{author.select_one(".gs_ai_name a")["href"]}'
            affiliations = author.select_one(".gs_ai_aff").text
            email = author.select_one(".gs_ai_eml").text
            try:
                cited_by = re.search(r"\d+", author.select_one(".gs_ai_cby").text).group()  # "Cited by 17143" -> "17143"
            except AttributeError:
                cited_by = None

            print(f"extracting authors at page #{params['astart']}.",
                  name,
                  link,
                  affiliations,
                  email,
                  cited_by, sep="\n")

        # if a next page token is present, update params for the next page
        next_button = soup.select_one("button.gs_btnPR")
        if next_button and next_button.get("onclick"):
            params["after_author"] = re.search(r"after_author\\x3d(.*)\\x26", str(next_button["onclick"])).group(1)  # -> XB0HAMS9__8J
            params["astart"] += 10
        else:
            authors_is_present = False

scrape_all_authors()


# output:
'''
extracting authors at page #0.
Johnson Thomas
https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Professor of Computer Science, Oklahoma State University
Verified email at cs.okstate.edu
159469
...

extracting authors at page #60.
Bulent Sankur
https://scholar.google.com/citations?hl=en&user=z9FUD8QAAAAJ
Professor of Electrical and Electronics Engineering, Bogazici University
Verified email at boun.edu.tr
16953
'''

Alternatively, you can achieve the same thing using the Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.

The difference is that you only need to iterate over a ready-made dictionary and pick out the data you need, without having to figure out how to scale the number of requests or how to get around blocks from the search engine.

Example code to integrate:

from serpapi import GoogleSearch
import os, json
from urllib.parse import urlsplit, parse_qsl


def serpapi_scrape_all_authors():
    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results["profiles"]:

            print(f'Currently extracting {profile["name"]} with {profile["author_id"]} ID.')

            thumbnail = profile["thumbnail"]
            name = profile["name"]
            link = profile["link"]
            author_id = profile["author_id"]
            affiliations = profile["affiliations"]
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

            if "next" in profile_results["pagination"]:
                # split URL in parts as a dict() and update search "params" variable to a new page
                search.params_dict.update(dict(parse_qsl(urlsplit(profile_results["pagination"]["next"]).query)))
            else:
                profiles_is_present = False

    return profile_results_data

print(json.dumps(serpapi_scrape_all_authors(), indent=2))


# output:
'''
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
...
Currently extracting Vladimir Ivanov with rddjbZcAAAAJ ID.

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2980,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  } ... other results
  {
    "thumbnail": "https://scholar.google.com/citations/images/avatar_scholar_56.png",
    "name": "Vladimir Ivanov",
    "link": "https://scholar.google.com/citations?hl=en&user=rddjbZcAAAAJ",
    "author_id": "rddjbZcAAAAJ",
    "email": null,
    "affiliations": "Blizzard Entertainment",
    "cited_by": null,
    "interests": [
      {
        "title": "Machine Learning",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Amachine_learning",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:machine_learning"
      },
      {
        "title": "Reinforcement Learning",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Areinforcement_learning",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:reinforcement_learning"
      },
      {
        "title": "Computer Vision",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Acomputer_vision",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:computer_vision"
      },
      {
        "title": "Cinematics",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Acinematics",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:cinematics"
      }
    ]
  }
]
'''

If you want to parse historic organic results from Google Scholar, there is a dedicated blog post on scraping historic Google Scholar results with Python.

Disclaimer: I work for SerpApi.
