Im learning to use Pinecone but i'm encountering an issue while performing a similarity_search operation in Pinecone. Despite successfully creating and populating the index, the search always returns null results. Here's a summary of my setup:
This is the whole code, Thank you in advance.
import os
import time
from dotenv import load_dotenv, find_dotenv
from langchain_community.document_loaders import PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
load_dotenv(find_dotenv(), override=True)
pdf_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'pdf_files', 'document_1.pdf')
loader = PDFMinerLoader(pdf_file_path)
document = loader.load()
embedding = OpenAIEmbeddings()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size = 512,
chunk_overlap = 100,
separators=[""])
chunks = text_splitter.split_documents(document)
pc = Pinecone()
index_name = 'tenders-index'
for i in pc.list_indexes().names():
pc.delete_index(i)
print('Index deleted')
if index_name not in pc.list_indexes().names():
print(f'Creating index {index_name}')
pc.create_index(
name=index_name,
dimension=1536,
metric='cosine',
spec=ServerlessSpec(
cloud="aws",
region="us-east-1"
)
)
print('Index created!')
else:
print(f'Index {index_name} already exists!')
while not pc.describe_index(index_name).status['ready']:
time.sleep(1)
vector_store = PineconeVectorStore.from_documents(
documents= chunks,
embedding=embedding,
index_name=index_name
)
query = 'The goal of this document is ...'
vstore = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embedding)
llm = ChatOpenAI(model='gpt-3.5-turbo')
docs = vstore.similarity_search(query=query, k=3)
chain = load_qa_chain(
llm = llm,
chain_type = 'stuff'
)
response = chain.run(input_documents = docs , question = query)
print(response)
print('Done!')
I’ve checked the following:
- Index exists
- Pinecone Index and OpenAIEmbedding have the same dimension
- VectorStore is generated