как получить доступ к содержимому PDF-файла, используя номер страницы в javascript langchain
Я создаю чат-бота-помощника в формате PDF, используя next js, langchain и pincone DB. Я могу получить контент идеально, но когда я пытаюсь получить доступ к содержимому страницы, PDF-файл говорит, что не может получить доступ к содержимому страницы. Мне нужно получить доступ к содержимому страницы, используя номер страницы
Для этого я извлек все страницы PDF-файла с помощью pdfjs, поместил метаданные на каждую страницу в виде номера страницы и использовал другую цепочку llmchain для получения результатов, но все еще сталкиваюсь с той же проблемой.
это вектор-store.ts
try {
const embeddings = new OpenAIEmbeddings();
const index = client.Index(env.PINECONE_INDEX_NAME);
// Add metadata to each document
const docsWithMetadata = docs.map((doc, index) => {
return {
...doc,
metadata: {
page_number: index, // Assuming page numbers start from 1
},
};
});
// Embed the PDF documents and store them in the vectorStore
await PineconeStore.fromDocuments(docsWithMetadata, embeddings, {
pineconeIndex: index,
textKey: "text",
});
} catch (error) {
console.log("error ", error);
throw new Error("Failed to load your docs !");
}
}
export async function getVectorStore(client: PineconeClient, pageFilter: string) {
try {
const embeddings = new OpenAIEmbeddings();
const index = client.Index(env.PINECONE_INDEX_NAME);
const pageFilterArray = JSON.parse(pageFilter);
const filter = pageFilterArray.length
? { page_number: { $in: pageFilterArray } }
: undefined;
console.log("The metadata filter sent ", filter);
const vectorStore = await PineconeStore.fromExistingIndex(embeddings, {
pineconeIndex: index,
textKey: "text",
filter,
});
return vectorStore;
} catch (error) {
console.log("error ", error);
throw new Error("Something went wrong while getting vector store !");
}
}
это мой шаблон подсказки
// Creates a standalone question from the chat-history and the current question
//@ts-nocheck
import {allPages, data} from "../scripts/output"
export const STANDALONE_QUESTION_TEMPLATE = `Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base.
Generate a search query based on the conversation and the new question.
Chat History:
{chat_history}
Question:
{question}
Search query:`;
// Actual question you ask the chat and send the response to client
export const QA_TEMPLATE = `You are an enthusiastic AI coding assistant.
Answer with the facts listed in the context below. If there isn't enough information below, say you don't know.
{context}
Question: {question}
Helpful answer in markdown`;
export const METADATA_GENERATOR_TEMPLATE =`
You are an expert text classifier. Your job is to generate an array of strings that are within the "context" that best match with the Input question.
First check for text-similarity match, if not fallback to semantic matching.
Context:
"""
${data}
"""
Question: {question}
Helpful answer in markdown:
Examples for QA on page page number:
Input: Can you summarize the document on page number?
Output: ["page number"]
`
это данные.тс
export const data = [
{
metadata: { page_number: 1 },
pageContent:
"Installation Guide Atlas Bio Series Access Control Panels www.zktecousa.com Ver 2.0 ",
},
{
metadata: { page_number: 2 },
pageContent:
"2 Atlas Bio Series Access Control Panels Installation Guide CONTENTS What’s in the Box ....................................................................... 3 Optional Accessories
},
{
metadata: { page_number: 3 },
pageContent:
"3 Atlas Bio Series Access Control Panels Installation Guide What’s in the Box 4 Diodes 4 Screws & Anchors 1 Screwdriver COMM RUN GND PWR NC COM NO SEN GND BUT NC COM NO SEN GND BUT NC
},
{
metadata: { page_number: 4 },
pageContent:
"4 Atlas Bio Series Access Control Panels Installation Guide Optional Accessories Wiegand
},
{
metadata: { page_number: 5 },
pageContent:
"5 Atlas Bio Series Access Control Panels Installation Guide Safety Precautions The following precautions are to keep user’s safe and prevent any damage. Please read carefully
},
это мой langchain.ts
export async function callChain({ question, chatHistory }: callChainArgs) {
try {
const sanitizedQuestion = question.trim().replaceAll("\n", " ");
console.log(sanitizedQuestion, "Question");
const pineconeClient = await getPineconeClient();
const { stream, handlers } = LangChainStream({
experimental_streamData: true,
});
const data = new experimental_StreamData();
const prompt = PromptTemplate.fromTemplate(METADATA_GENERATOR_TEMPLATE);
const chainA = new LLMChain({ llm: nonStreamingModel, prompt });
// The result is an object with a `text` property.
const metadata = await chainA.call({ question: sanitizedQuestion });
console.log("The resA ", metadata);
const vectorStore = await getVectorStore(pineconeClient, metadata.text);
// Create the ConversationalRetrievalQAChain with the specified models and options
const chain = ConversationalRetrievalQAChain.fromLLM(
streamingModel,
vectorStore.asRetriever(),
{
qaTemplate: QA_TEMPLATE,
questionGeneratorTemplate: STANDALONE_QUESTION_TEMPLATE,
returnSourceDocuments: true,
memory: new BufferMemory({
memoryKey: "chat_history",
inputKey: "question", // The key for the input to the chain
outputKey: "text", // The key for the final conversational output of the chain
returnMessages: true, // If using with a chat model (e.g. gpt-3.5 or gpt-4)
}),
questionGeneratorChainOptions: {
llm: nonStreamingModel,
},
}
);
// Call the chain with the sanitized question and chat history
chain
.call(
{
question: sanitizedQuestion,
chat_history: chatHistory,
},
[handlers]
)
.then(async (res) => {
// Extract the source documents from the response
const sourceDocuments = res?.sourceDocuments;
// Get the page content of the first two documents
const firstTwoDocuments = sourceDocuments.slice(0, 2);
const pageContents = firstTwoDocuments.map(
({ pageContent }: { pageContent: string }) => pageContent
);
// Append the page contents to the experimental_StreamData object
data.append({
sources: pageContents,
});
// Close the experimental_StreamData object
data.close();
});
// Return the streaming text response
return new StreamingTextResponse(stream, {}, data);
} catch (e) {
console.error(e);
throw new Error("Call chain method failed to execute successfully!!");
}
}
Я чувствую, что, скорее всего, я ошибаюсь с шаблоном подсказки или ланчейном.