Как дополнить (augment) два эмбеддинга разных размеров?
Я пытаюсь реализовать это решение: https://www.mlq.ai/gpt-4-pinecone-website-ai-assistant/
Здесь у меня возникла проблема: переменная «res» не определена. Я просмотрел документацию, но так и не понял, откуда берётся эта «res».
Вот код. Я предположил, что «res» может быть «ответом» (response), поскольку такая переменная уже определена в коде, но ошибки всё равно остаются.
# -*- coding: utf-8 -*-
!pip install tiktoken openai pinecone-client -q
import openai
import tiktoken
import pinecone
import os
import re
import requests
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
from IPython.display import Markdown
# API credentials. Environment variables take precedence so that real keys do
# not have to be written into the source; the literal placeholders are kept as
# fallbacks for anyone who still edits them in place.
openai.api_key = os.environ.get("OPENAI_API_KEY", "KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "ENV")
# Regex pattern to match an absolute http:// or https:// URL.
# ``https?`` allows exactly one optional "s"; ``http[s]*`` would also accept
# malformed schemes such as "httpsss://".
HTTP_URL_PATTERN = r'^https?://.+'
# Define root domain to crawl
domain = "domain-name.com"
# Start URL of the crawl, derived from ``domain`` so the two cannot drift apart
full_url = "https://" + domain + "/"
# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag fed to the parser.

    After ``feed()`` the collected values are available, in document order,
    in ``self.hyperlinks``.
    """

    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an anchor tag; ignore every other tag."""
        if tag != "a":
            return
        href = dict(attrs).get("href")
        # A valueless attribute (``<a href>``) is reported with value None;
        # storing it would make later string operations on the link fail.
        if href is not None:
            self.hyperlinks.append(href)
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Return the raw ``href`` values of the anchors on the HTML page at *url*.

    This is best-effort: any failure while fetching or decoding the page is
    printed and an empty list is returned. Non-HTML responses (including
    responses without a Content-Type header) also yield an empty list.
    """
    # Try to open the URL and read the HTML
    try:
        # The timeout keeps one unresponsive server from hanging the caller
        with urllib.request.urlopen(url, timeout=30) as response:
            headers = response.info()
            # If the response is not HTML, return an empty list
            content_type = headers.get('Content-Type') or ""
            if not content_type.lower().startswith("text/html"):
                return []
            # Decode with the charset the server declared, falling back to
            # UTF-8; undecodable bytes are replaced instead of discarding
            # the whole page.
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Deliberately broad: a single bad page must not abort the caller
        print(e)
        return []
    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)
    return parser.hyperlinks
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links found on *url* that stay on *local_domain*.

    Absolute http(s) links are kept only when their host equals
    *local_domain*. Links without a scheme are treated as relative to the
    site root and turned into ``https://<local_domain>/<link>``. Fragment-only
    links, links with any other scheme (``mailto:``, ``tel:``,
    ``javascript:`` ...) and links to other hosts are dropped. A trailing
    ``/`` is removed from every returned link.
    """
    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # A valueless ``href`` attribute may be reported as None
        if link is None:
            continue
        try:
            parts = urlparse(link)
        except ValueError:
            # Malformed URL (e.g. unbalanced brackets in the host part)
            continue
        clean_link = None
        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            if parts.netloc == local_domain:
                clean_link = link
        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("#") or parts.scheme:
                # Fragment-only or non-http scheme: nothing to crawl
                continue
            if parts.netloc:
                # Protocol-relative link ("//host/path"): keep it only when
                # it points back at this site.
                if parts.netloc == local_domain:
                    clean_link = "https:" + link
            else:
                if link.startswith("/"):
                    link = link[1:]
                clean_link = "https://" + local_domain + "/" + link
        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.add(clean_link)
    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)
def crawl(url):
    """Crawl the pages reachable from *url* on its own domain and save their text.

    Each page's tag-free text is written to ``text/<domain>/<name>.txt``,
    where ``<name>`` is the URL without its scheme and with ``/`` replaced
    by ``_``. The ``processed`` directory is created as well. Pages that
    cannot be fetched, or that only contain the "enable JavaScript" notice,
    are reported and skipped; the crawl carries on with the remaining URLs.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc
    # Create a queue to store the URLs to crawl
    queue = deque([url])
    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = {url}
    # Create a directory to store the text files
    text_dir = "text/" + local_domain + "/"
    os.makedirs(text_dir, exist_ok=True)
    # Create a directory to store the csv files
    os.makedirs("processed", exist_ok=True)
    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue (taken from the right-hand end)
        url = queue.pop()
        print(url)  # for debugging and to see the progress
        try:
            page = requests.get(url, timeout=30)
        except requests.RequestException as e:
            # One unreachable page must not abort the whole crawl
            print("Unable to fetch page " + url + ": " + str(e))
            continue
        # Get the text from the URL using BeautifulSoup, without the tags
        text = BeautifulSoup(page.text, "html.parser").get_text()
        if "You need to enable JavaScript to run this app." in text:
            # Nothing useful to store for a page that needs JavaScript
            print("Unable to parse page " + url + " due to JavaScript being required")
        else:
            # Drop the scheme ("https://" or "http://") before building the name
            file_name = url.split("://", 1)[-1].replace("/", "_") + ".txt"
            # Save text from the url to a <url>.txt file
            with open(text_dir + file_name, "w") as f:
                f.write(text)
        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)
# Start crawling from the configured start URL; this runs as soon as the line
# executes and performs the network requests and file writes done by crawl().
crawl(full_url)
def remove_newlines(serie):
    """Return *serie* with newlines replaced by spaces and space runs shortened.

    Both real newline characters and the literal two-character sequence
    backslash + ``n`` are replaced by a single space. Double spaces are then
    collapsed to single spaces in two passes, so runs of up to four spaces
    become one space (longer runs are shortened but may keep extra spaces).

    *serie* is a pandas Series of strings; a new Series is returned.
    """
    # regex=False forces literal matching: the default of ``regex`` differs
    # between pandas versions, and as a regex '\\n' would match a newline
    # rather than the literal backslash-n text.
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    # Two passes of "two spaces -> one space"
    for _ in range(2):
        serie = serie.str.replace('  ', ' ', regex=False)
    return serie
import pandas as pd
# Create a list to store the text files
texts = []
# Same relative directory that crawl() writes to
text_dir = "text/" + domain + "/"
# Every file name starts with "<domain>_"; skip that prefix for the title
# (computed from ``domain`` instead of a fixed character count).
prefix_len = len(domain) + 1
# Get all the text files in the text directory
for file in os.listdir(text_dir):
    # Open the file and read the text
    with open(text_dir + file, "r") as f:
        text = f.read()
    # Extract the original URL from the filename.
    # NOTE: "/" was stored as "_", so a URL that itself contained "_" cannot
    # be told apart and is reconstructed with "/" in that position.
    original_url = "https://" + file[:-4].replace("_", "/")
    title = file[prefix_len:-4].replace('-', ' ').replace('_', ' ').replace('#update', '')
    texts.append((title, text, original_url))
# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['fname', 'text', 'url'])
# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
# Written to the relative "processed" directory, which is where the CSV is
# read back from later in the script
df.to_csv('processed/scraped.csv')
df.head()
import tiktoken
# cl100k_base is the tokenizer designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the scraped pages from disk and give the columns their final names
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text', 'url']


def _token_count(page_text):
    """Return the number of tokens *page_text* encodes to."""
    return len(tokenizer.encode(page_text))


def _embed(page_text):
    """Return the embedding vector for *page_text* (one API request per call)."""
    reply = openai.Embedding.create(input=page_text, engine='text-embedding-ada-002')
    return reply['data'][0]['embedding']


# Store the token count of every row, then plot its distribution
df['n_tokens'] = df['text'].apply(_token_count)
df['n_tokens'].hist()
# NOTE(review): n_tokens is only plotted, never used to split or truncate the
# text - confirm that no page exceeds the embedding model's input limit.
df['embeddings'] = df['text'].apply(_embed)
df.head()
# Give every row a random UUID string in a new 'id' column
from uuid import uuid4
row_ids = []
for _ in range(len(df)):
    row_ids.append(str(uuid4()))
df['id'] = row_ids
# Rows whose 'title' is null get the placeholder 'No Title'
titles = df['title']
df['title'] = titles.fillna('No Title')
print(df)
# Define index name
index_name = 'INDEX_NAME'
# Initialize connection to Pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
# Connect to the index and view index stats
index = pinecone.Index(index_name)
index.describe_index_stats()
from tqdm.auto import tqdm
batch_size = 100  # how many embeddings we insert at once
# Convert the DataFrame to a list of dictionaries (one per row)
chunks = df.to_dict(orient='records')
# Upsert embeddings into Pinecone in batches of ``batch_size``
for i in tqdm(range(0, len(chunks), batch_size)):
    # Slicing past the end is safe, so the last batch may simply be shorter
    batch = chunks[i:i + batch_size]
    # Each vector is an (id, embedding, metadata) tuple
    to_upsert = [
        (
            row['id'],
            row['embeddings'],
            {
                'title': row['title'],
                'text': row['text'],
                'url': row['url'],
            },
        )
        for row in batch
    ]
    index.upsert(vectors=to_upsert)
embed_model = "text-embedding-ada-002"
user_input = "Write a financial article about the 5 Steps to Avoid Retirement Hell"
# Embed the question with the same model name that was used for the page
# embeddings, so the query vector and the stored vectors are comparable
embed_query = openai.Embedding.create(
    input=user_input,
    engine=embed_model
)
query_embeds = embed_query['data'][0]['embedding']
# ``res`` is the Pinecone query response; it has to be assigned here, before
# the context list below reads ``res['matches']``
res = index.query(query_embeds, top_k=5, include_metadata=True)
# Text of each retrieved page, taken from the metadata stored with the vector
contexts = [item['metadata']['text'] for item in res['matches']]
# Retrieved contexts first, then a separator, then the user's original request
augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + user_input
# system message to assign role the model
# (plain string: there are no placeholders, so no f-string is needed; the typo
# "helpul" and a stray quote-and-period at the end were removed)
system_msg = """You are a helpful machine learning assistant and tutor. Answer questions based on the context provided, or say I don't know.
"""
chat = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": augmented_query}
    ]
)
# Imported explicitly so the call does not rely on the notebook injecting it
from IPython.display import display
display(Markdown(chat['choices'][0]['message']['content']))
Я пытался подставить вместо неё переменную, которая уже определена в коде, но это не сработало. Задача — дополнить запрос, объединив полученный контекст (наши HTML-данные в виде эмбеддингов) с исходным запросом (нашим вопросом к GPT).