I wanted to make a web app that uses llama-index to answer queries using RAG over specific documents. I have set up the Llama-3.2-1B-Instruct LLM locally, and I use it together with a local embedding model from Hugging Face to index the documents and generate the responses. The problem is that I am not sure how to host it or how to make it more efficient. I also tried the Hugging Face Inference feature, which gives you an API URL for a hosted LLM so that you can use the model without loading it onto your own system. It works fine for normal chat and is much faster as well, but I am not sure how to index my own documents with it in order to get the responses I want. I need some help with that.
Below is my code using the local LLM, which works fine but is slower:
import logging
import sys
import dotenv
import os
# Load variables from a local .env file into the process environment.
dotenv.load_dotenv()
# basicConfig(stream=...) already attaches a stdout StreamHandler to the root
# logger. Attaching a second stdout handler on top of it made every log
# record appear twice, so only the basicConfig handler is kept.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
import torch
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, load_index_from_storage
# Read every file under ./data into llama-index documents.
documents = SimpleDirectoryReader('data').load_data()

# NOTE(review): the <|SYSTEM|>/<|USER|>/<|ASSISTANT|> markers below look like
# they come from a different model's prompt template rather than Llama 3's
# chat format -- confirm against the model card before changing them.
system_prompt = """<|SYSTEM|> #
You are my assistant who gives me details about my courses which i have given in the form of documents of .xlsx format
"""
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

# End-of-sequence token ids published in the Llama 3.2 Instruct generation
# config (<|end_of_text|>, <|eom_id|>, <|eot_id|>). The previous values
# (50278, 50279, 50277, 1, 0) do not correspond to this model's stop tokens,
# so generation could halt on ordinary text tokens or never hit a real stop.
LLAMA3_STOP_IDS = [128001, 128008, 128009]

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=128,
    generate_kwargs={
        # Greedy decoding. `temperature` is only honoured when sampling is
        # enabled, so it is omitted instead of being passed and ignored.
        "do_sample": False,
        # Pad with an end-of-sequence id instead of id 0, which is not a
        # padding token for this tokenizer.
        "pad_token_id": LLAMA3_STOP_IDS[0],
    },
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-3.2-1B-Instruct",
    model_name="meta-llama/Llama-3.2-1B-Instruct",
    device_map="auto",
    stopping_ids=LLAMA3_STOP_IDS,
    tokenizer_kwargs={"max_length": 2048},  # Reduced to fit inside memory
)

# Register the LLM and chunking size as llama-index global defaults.
Settings.llm = llm
Settings.chunk_size = 1024

# Use a local embedding model instead of OpenAI
local_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Set the embedding model in the Settings
Settings.embed_model = local_embed_model

# Build the vector index from the loaded documents using the settings above.
index = VectorStoreIndex.from_documents(documents)
def query_engine_response(query: str, similarity_top_k: int = 1) -> str:
    """Answer *query* from the module-level ``index`` and return the text.

    Args:
        query: Natural-language question to run against the index.
        similarity_top_k: Number of most similar chunks to retrieve as
            context. Defaults to 1, the value that used to be hard-coded.

    Returns:
        The generated answer converted to a plain string.
    """
    # The whole answer is returned as a single string, so streaming is not
    # requested: a streamed response would just be drained by str() anyway.
    query_engine = index.as_query_engine(similarity_top_k=similarity_top_k)
    response = query_engine.query(query)
    return str(response)
And below is the code using HF Inference (the hosted LLM from Hugging Face), which I am not sure how to integrate with the indexing feature:
import requests
import os

# Chat-completions endpoint that every request in this script is posted to.
API_URL = "https://router.huggingface.co/novita/v3/openai/chat/completions"

# Take the access token from the HF_TOKEN environment variable so it does not
# have to be committed to source. The original placeholder is kept as the
# fallback, so behaviour is unchanged when the variable is unset or empty.
_hf_token = os.environ.get("HF_TOKEN") or "hf_XXXXXXXXXXXXXXXXXXXXXXXX"
headers = {
    "Authorization": f"Bearer {_hf_token}",
}
def query(payload, timeout=60):
    """POST *payload* as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    # Fail here on an HTTP error instead of returning an error document that
    # would only surface later as a missing key in the caller.
    response.raise_for_status()
    return response.json()
# Example request: a single user message sent to the hosted model.
chat_payload = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "model": "meta-llama/llama-3.2-1b-instruct",
}
response = query(chat_payload)

# Print the text of the first returned choice.
first_choice = response['choices'][0]
print(first_choice['message']['content'])