import streamlit as st
import pickle
import numpy as np
import os
import time
# Directory that holds the pickled corpus, the BM25 index and the FAISS index.
# It is created up front so later reads/writes under it do not fail merely
# because the directory is missing.
INDEX_DIR = "/app/indexes"
if not os.path.exists(INDEX_DIR):
    os.makedirs(INDEX_DIR, exist_ok=True)
st.set_page_config(page_title="RAG Q&A System", layout="wide")

# Page-level styling hook (the string currently holds no rules).
# NOTE(review): unsafe_allow_html=True is passed throughout this header, yet
# none of the strings contain markup -- confirm whether HTML/CSS was lost.
st.markdown("""
""", unsafe_allow_html=True)

# Title and subtitle. Each literal is written on one line with explicit "\n"
# escapes: a single-quoted Python string may not span physical lines.
st.markdown("\nAI-Powered Q&A System\n", unsafe_allow_html=True)
st.markdown(
    "Hybrid RAG: BM25 + E5 Embeddings | Rohith Kumar Reddipogula\n",
    unsafe_allow_html=True,
)

# Headline metrics, one per column. The figures are hard-coded display
# values; nothing in this section computes them.
c1, c2, c3 = st.columns(3)
with c1:
    st.markdown("Accuracy\n93.0%\n", unsafe_allow_html=True)
with c2:
    st.markdown("Improvement\n+11.4%\n", unsafe_allow_html=True)
with c3:
    st.markdown("MRR\n1.0\n", unsafe_allow_html=True)

st.markdown("---")
# Resource loading with bootstrap data and error reporting.
@st.cache_resource
def load_system():
    """Load (or bootstrap) every retrieval resource the app needs.

    If ``corpus.pkl`` is absent from ``INDEX_DIR``, a small synthetic corpus
    and a matching BM25 index are generated and pickled there first. The
    corpus, the BM25 index, a FAISS index and the sentence encoder are then
    loaded.

    Returns:
        dict | None: ``{"texts", "ids", "bm25", "faiss", "model"}`` on
        success, or ``None`` after the failure has been reported through
        ``st.error``.

    NOTE(review): the function is wrapped in ``st.cache_resource``; a ``None``
    result from a transient failure is presumably cached as well -- confirm
    whether the cache needs clearing to retry.
    """
    corpus_path = os.path.join(INDEX_DIR, "corpus.pkl")
    bm25_path = os.path.join(INDEX_DIR, "bm25_index.pkl")
    faiss_path = os.path.join(INDEX_DIR, "faiss.index")

    try:
        # Bootstrap a tiny demo corpus when no corpus has been provided.
        if not os.path.exists(corpus_path):
            print("Creating sample data...")
            sample_docs = [
                {"doc_id": "1", "text": "Machine learning is AI that learns from data."},
                {"doc_id": "2", "text": "Neural networks mimic human brain structure."},
                {"doc_id": "3", "text": "Deep learning uses multiple neural layers."},
                {"doc_id": "4", "text": "Transformers revolutionized NLP with attention."},
            ] * 50  # 200 docs
            with open(corpus_path, "wb") as f:
                pickle.dump(sample_docs, f)

            # Imported lazily so a missing package surfaces as a load error.
            from rank_bm25 import BM25Okapi

            # Tokenise exactly like search() tokenises queries (lower-case,
            # whitespace split). Indexing each sentence as one single token
            # would make every query term miss and every BM25 score zero.
            tokenized_docs = [d["text"].lower().split() for d in sample_docs]
            bm25 = BM25Okapi(tokenized_docs)
            with open(bm25_path, "wb") as f:
                pickle.dump({"bm25": bm25}, f)
            print(" Sample data created!")

        # SECURITY NOTE: pickle.load can execute arbitrary code contained in
        # the file. Only place index files you produced yourself in INDEX_DIR.
        with open(corpus_path, "rb") as f:
            corpus_data = pickle.load(f)
        # Anything other than a list is treated as an empty corpus.
        docs = corpus_data if isinstance(corpus_data, list) else []
        texts = [d["text"] if isinstance(d, dict) else str(d) for d in docs]
        # Non-dict entries have no .get(); fall back to their position as id,
        # mirroring the non-dict handling used for `texts`.
        ids = [
            str(d.get("doc_id", i)) if isinstance(d, dict) else str(i)
            for i, d in enumerate(docs)
        ]

        with open(bm25_path, "rb") as f:
            bm25_data = pickle.load(f)
        # Accept both the {"bm25": index} wrapper and a bare index object.
        bm25 = bm25_data["bm25"] if isinstance(bm25_data, dict) else bm25_data

        import faiss

        try:
            faiss_index = faiss.read_index(faiss_path)
        except Exception:
            # No usable index on disk: fall back to an EMPTY inner-product
            # index, so dense search simply contributes no hits.
            # 768 presumably matches the encoder's embedding width -- confirm.
            dimension = 768
            faiss_index = faiss.IndexFlatIP(dimension)

        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer("intfloat/e5-base-v2")
        return {"texts": texts, "ids": ids, "bm25": bm25, "faiss": faiss_index, "model": model}
    except Exception as e:
        # Top-level boundary: report the problem in the UI instead of crashing.
        st.error(f"Load error: {e}")
        return None
# Obtain the retrieval resources while a progress spinner is displayed;
# `system` is None when loading failed.
with st.spinner(" Initializing RAG System..."):
    system = load_system()
# Retrieval: BM25, dense (FAISS) or a weighted blend of both.
def search(system, query, method="hybrid", top_k=5, alpha=0.7):
    """Return up to *top_k* corpus documents ranked for *query*.

    Args:
        system: Resource dict as returned by ``load_system`` (keys ``texts``,
            ``ids``, ``bm25``, ``faiss``, ``model``). A falsy value yields no
            results.
        query: Free-text question.
        method: ``"hybrid"`` blends dense and BM25 scores, ``"dense"`` uses
            dense scores only, any other value uses BM25 only.
        top_k: Maximum number of results; values ``<= 0`` yield no results.
        alpha: Weight of the dense score in hybrid mode (BM25 gets
            ``1 - alpha``).

    Returns:
        list[dict]: ``{"doc_id", "text", "score"}`` entries, best first.
        Documents whose final score is not strictly positive are dropped.
    """
    if not system:
        return []
    texts, ids = system["texts"], system["ids"]
    n = len(texts)
    # Nothing to rank or nothing requested: stop before ndarray.max() raises
    # on an empty score array or the index is asked for zero neighbours.
    if n == 0 or top_k <= 0:
        return []

    # Sparse (BM25) scores -- skipped in dense-only mode, where they are unused.
    bm25_norm = None
    if method != "dense":
        tokenized = query.lower().split()
        bm25_scores = np.array(system["bm25"].get_scores(tokenized))
        bm25_max = bm25_scores.max()
        # Scale so the best lexical match scores 1.0; left as-is when no
        # document matched at all.
        bm25_norm = bm25_scores / bm25_max if bm25_max > 0 else bm25_scores

    # Dense scores -- skipped in BM25-only mode so the encoder is not run
    # for a result that would be discarded.
    dense_norm = None
    if method in ("hybrid", "dense"):
        # The "query: " prefix is prepended before encoding (presumably the
        # E5 query convention).
        q_emb = system["model"].encode([f"query: {query}"]).astype("float32")
        # Over-fetch 4x top_k candidates, capped at the corpus size.
        dense_scores_raw, dense_idx = system["faiss"].search(q_emb, min(top_k * 4, n))
        # Despite the name, these are the raw index scores (no rescaling);
        # documents that were not retrieved keep a score of 0.
        dense_norm = np.zeros(n)
        for score, idx in zip(dense_scores_raw[0], dense_idx[0]):
            # Ignore ids outside the corpus (e.g. a -1 placeholder or an
            # index that is out of sync with the corpus).
            if 0 <= idx < n:
                dense_norm[idx] = float(score)

    if method == "hybrid":
        final = alpha * dense_norm + (1 - alpha) * bm25_norm
    elif method == "dense":
        final = dense_norm
    else:
        final = bm25_norm

    top_idx = np.argsort(final)[::-1][:top_k]
    return [
        {"doc_id": ids[i], "text": texts[i], "score": float(final[i])}
        for i in top_idx
        if final[i] > 0
    ]
# Sidebar: retrieval settings plus a status summary.
with st.sidebar:
    st.markdown("## Configuration")
    method_label = st.selectbox("Method", ["Hybrid (Recommended)", "Dense", "BM25"])
    # Map the display label to the `method` value understood by search().
    method = {"Hybrid (Recommended)": "hybrid", "Dense": "dense", "BM25": "sparse"}[method_label]
    top_k = st.slider("Results", 1, 10, 5)
    # The blend weight is only adjustable in hybrid mode; otherwise it stays
    # at 0.7 (search() ignores it for the other methods).
    alpha = st.slider("Alpha", 0.0, 1.0, 0.7) if method == "hybrid" else 0.7
    st.markdown("---")
    # Report readiness truthfully: `system` is None when loading failed.
    if system:
        st.success(" System Ready!")
    else:
        st.error("System failed to load.")
    st.metric("Documents", len(system["texts"]) if system else 0)
# Question box and search action.
query = st.text_input("❓ Ask a question:", placeholder="What is machine learning?")
search_clicked = st.button(" Search", type="primary")
if search_clicked and not query.strip():
    # Tell the user why nothing happened instead of silently ignoring the click.
    st.warning("Please enter a question.")
elif search_clicked and system:
    with st.spinner("Searching..."):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump if the wall clock
        # is adjusted.
        t0 = time.perf_counter()
        results = search(system, query, method, top_k, alpha)
        elapsed = time.perf_counter() - t0
    if results:
        st.success(f" {len(results)} results ({elapsed:.2f}s)")
        # NOTE(review): document text is rendered with unsafe_allow_html=True
        # and is not escaped -- confirm the corpus is trusted.
        # The string body is kept flush-left on purpose so the rendered
        # markdown text carries no leading indentation.
        for i, r in enumerate(results, 1):
            st.markdown(f"""
#{i} | Score: {r['score']:.3f}
ID: {r['doc_id']}
{r['text']}
""", unsafe_allow_html=True)
    else:
        st.warning("No results found.")
# Footer. The literal is written on one line with an explicit "\n" escape:
# a single-quoted Python string may not span physical lines.
st.markdown("---")
st.markdown("Rohith Kumar Reddipogula | MSc Data Science\n", unsafe_allow_html=True)