ООП-программирование на Python для парсинга веб-страниц и извлечения ключевых слов не работает из-за ошибок в синтаксисе кода [закрыто]
Я пытаюсь включить этот фрагмент кода в синтаксис ООП. Код хорошо работает вне схемы ООП. Мне нужно поместить неопределенные переменные, взятые из ранее вызванных функций, внутри объектов, чтобы он работал, но я не знаю, как этого добиться! Любая помощь или идеи были бы действительно полезны.
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
import textacy
from url_extractor import RecursiveScraper
import requests
from bs4 import BeautifulSoup
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
nlp = spacy.load("en_core_web_sm")
class KeywordExtractor: # this is the class
rscraper = RecursiveScraper(input()) # this is the url web scraper from another code imported (it was a class there and I imported here as a library) it asks for a url
rscraper.scrape()
print(rscraper.urls)
new= list(rscraper.urls) # here we scrape all the sub-urls from the main url
for i in new:
print(i)
def get_Text():
r = requests.get(i) # undefined variable 1
soup = BeautifulSoup(r.content, "html.parser") # here we het the content of each url separately
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = '\n'.join(chunk for chunk in chunks if chunk)
new = text.encode("utf-8")
final=new.decode("utf-8")
new1=final.lower()
s="".join([w for w in new1 if w not in string.punctuation])
return s # here we return the content as a string without punctiations
def remove_verbs():
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')
docx = nlp(s) # undefined variable 2
nouns1 = []
for token in docx:
if token.is_stop != True and token.is_punct != True and token.pos_ == 'VERB':
nouns1.append(token)
list_of_strings1 = [y.text for y in nouns1] # we add the verbs in the stopword list
for z in list_of_strings1:
if z not in stopwords:
stopwords.append(z)
no_stop_txt = []
for token in docx:
if token.text not in stopwords:
no_stop_txt.append(token.text)
return no_stop_txt # we append every verb of the text data in the stopword list
str1=" ".join(no_stop_txt) # turns spacy,token.token into a string
res = "".join(filter(lambda x: not x.isdigit(), str1)) # we remove digits from the string
res=[res] # turns string to list to be read from the tfidf
def compute_tfidf():
vectorizer = TfidfVectorizer(max_df=1, min_df=1, stop_words=None, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(res) # compute tfidf from the list with undefined variable 3
feature_names = vectorizer.get_feature_names()
if __name__=="__main__":
keywords = KeywordExtractor()
keywords.compute_tfidf # no idea what to do here and how to fill the missing parts and undefined variables!