我正在从 PDF 文档中提取文本,并将其加载到 Azure Cognitive Search 中,以便使用 RAG 方法。不幸的是,这行不通,我收到了如下错误消息:
HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code:
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
字符串
我想做的是:
1. 通过 pymupdf 从 PDF 中提取文本(这一步可以正常工作)
2. 将其作为带有向量和元数据 `filename` 的嵌入上传到 Azure 向量搜索
3. 通过 ChatGPT 模型进行查询

除了这个错误之外,我还想给 document 对象添加元数据信息 `filename`,但也不知道该如何扩展它。
我的代码:
!pip install cohere tiktoken
!pip install openai==0.28.1
!pip install pymupdf
!pip install azure-storage-blob azure-identity
!pip install azure-search-documents --pre --upgrade
!pip install langchain
import fitz
import time
import uuid
import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import AzureSearch
from langchain.docstore.document import Document
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from google.colab import drive
# Azure OpenAI connection settings ("xxx" values are placeholders to fill in).
OPENAI_API_BASE = "https://xxx.openai.azure.com"
OPENAI_API_KEY = "xxx"
OPENAI_API_VERSION = "2023-05-15"
# Point the module-level openai client at the Azure endpoint configured above.
openai.api_type = "azure"
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION
# Azure Cognitive Search settings. NOTE: despite its name, SERVICE_NAME holds
# the full endpoint URL; it is passed as azure_search_endpoint below.
AZURE_COGNITIVE_SEARCH_SERVICE_NAME = "https://xxx.search.windows.net"
AZURE_COGNITIVE_SEARCH_API_KEY = "xxx"
AZURE_COGNITIVE_SEARCH_INDEX_NAME = "test"
# Chat model backed by the "gpt35" deployment.
llm = AzureChatOpenAI(deployment_name="gpt35", openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
# Embedding client backed by the "ada002" deployment.
embeddings = OpenAIEmbeddings(deployment_id="ada002", chunk_size=1, openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
# Vector store over the search index named above; embeddings.embed_query is
# handed over as the embedding function.
acs = AzureSearch(azure_search_endpoint=AZURE_COGNITIVE_SEARCH_SERVICE_NAME,
azure_search_key = AZURE_COGNITIVE_SEARCH_API_KEY,
index_name = AZURE_COGNITIVE_SEARCH_INDEX_NAME,
embedding_function = embeddings.embed_query)
def generate_tokens(s, f):
    """Split text ``s`` into chunks and wrap each chunk in a ``Document``.

    Every document carries two metadata entries: ``index`` (the chunk's
    zero-based position among the chunks of ``s``) and ``file_source``
    (the value of ``f``). The resulting list is printed and then returned.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(s)
    documents = [
        Document(
            page_content=chunk,
            metadata={"index": position, "file_source": f},
        )
        for position, chunk in enumerate(chunks)
    ]
    print(documents)
    return documents
# Ingest every PDF in the Drive folder: extract its text, split it into
# chunks tagged with the source file, and upload the chunks to the vector store.
drive.mount('/content/drive')
folder = "/content/drive/.../pdf/"

for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if not os.path.isfile(file_path):
        continue
    print(f"Processing file: {file_path}")

    # Collect the text per file (not in a variable shared across files), so
    # the chunks of one PDF never contain text from previously processed PDFs.
    # The context manager closes the document once its pages have been read.
    with fitz.open(file_path) as doc:
        doc_content = "".join(page.get_text() for page in doc)

    # Chunk once per file, after all pages were read; each chunk's metadata
    # records file_path as its "file_source".
    d = generate_tokens(doc_content, file_path)

    # NOTE: this upload is the call that raised the HttpResponseError
    # ("The property 'content' does not exist ...") shown in the traceback.
    if d:  # nothing to upload when no text could be extracted
        acs.add_documents(documents=d)

    print("----------")
    print(doc_content)
    # Whitespace-separated word count, used as a rough size indicator.
    count = len(doc_content.split())
    print("Number of tokens: ", count)
HttpResponseError Traceback (most recent call last)
<ipython-input-11-d9eaff7ee027> in <cell line: 10>()
31 all_texts.extend(d)
32
---> 33 acs.add_documents(documents=d)
34
35 metadatas = [{"Source": f"{i}-pl"} for i in range(len(all_texts))]
7 frames
/usr/local/lib/python3.10/dist-packages/azure/search/documents/_generated/operations/_documents_operations.py in index(self, batch, request_options, **kwargs)
1249 map_error(status_code=response.status_code, response=response, error_map=error_map)
1250 error = self._deserialize.failsafe_deserialize(_models.SearchError, pipeline_response)
-> 1251 raise HttpResponseError(response=response, model=error)
1252
1253 if response.status_code == 200:
HttpResponseError: () The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code:
Message: The request is invalid. Details: The property 'content' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
型
这是我在Azure Cognitive Search中的索引:
(此处原为索引字段配置的截图,抓取时图片未能保留。)
1条答案
按热度按时间mklgxw1f1#
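我现在已经解决了这个问题。你必须在 Azure 认知搜索的索引中创建必要的字段,否则上传时就会出现“属性 'content' 不存在”的错误。
(原回答此处为字段列表的截图,图片未能保留。按照 LangChain 的 AzureSearch 默认配置,所需字段应为 id、content、content_vector 和 metadata,请以官方文档为准。)
其中字段 content_vector 似乎用于存放向量。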
字符串
和
的