
Commit 63cb0fb

First Commit
1 parent 9e51ade commit 63cb0fb

16 files changed: +1747 -0 lines changed

Chatbot.py

Lines changed: 532 additions & 0 deletions
Large diffs are not rendered by default.

Documents/Advanced_Facts_Octopus.pdf (4.22 KB)
Binary file not shown.
Lines changed: 45 additions & 0 deletions
Advanced Information on Velociraptors (Velociraptor mongoliensis)

Taxonomy & Classification:
- Kingdom: Animalia
- Phylum: Chordata
- Class: Reptilia
- Clade: Dinosauria
- Order: Saurischia
- Suborder: Theropoda
- Family: Dromaeosauridae
- Genus: Velociraptor
- Species: V. mongoliensis

Physical Characteristics:
- Velociraptors were small to medium-sized dromaeosaurid theropods, measuring about 2 meters (6.6 ft) long and about 0.5 meters (1.6 ft) tall at the hip.
- They weighed around 15–20 kg (33–44 lbs), roughly the size of a large turkey.
- They had long, stiff tails used for balance and agility, and a large sickle-shaped claw on each hind foot, thought to be their primary killing tool.

Feathers and Evolutionary Significance:
- Fossil evidence, particularly from specimens found in Mongolia and China, strongly suggests that velociraptors had feathers.
- Quill knobs (attachment points for feathers) were found on the forearms of some fossils, indicating a bird-like appearance.
- Velociraptors represent a transitional form between non-avian dinosaurs and modern birds, strengthening the evolutionary link.

Behavior and Hunting:
- While often depicted hunting in packs (popularized by films), there is limited direct fossil evidence to support this theory.
- Their brain-to-body size ratio suggests high intelligence among dinosaurs, potentially making them adept at complex behaviors like coordinated hunting or problem-solving.
- Their strong hind legs and curved claws suggest a predatory lifestyle, capable of pouncing on and pinning prey rather than chasing it over long distances.

Diet:
- Velociraptors were carnivorous and likely fed on small reptiles, mammals, and possibly carrion.
- One famous fossil (the "Fighting Dinosaurs" specimen) captures a Velociraptor locked in combat with a Protoceratops, showing its active predatory nature.

Habitat and Fossil Record:
- Lived during the Late Cretaceous Period (~75–71 million years ago).
- Fossils have been found primarily in the Gobi Desert (Mongolia and northern China).
- The arid environment suggests they may have been desert-adapted predators.

Pop Culture vs. Science:
- Pop culture often exaggerates Velociraptor size, confusing it with Deinonychus, a larger relative.
- In reality, Velociraptor was feathered, much smaller, and more bird-like than lizard-like.

Significance in Paleontology:
- Velociraptors are pivotal in studying the dinosaur-bird connection.
- Their anatomy and preserved features continue to offer insights into the evolution of flight, intelligence, and social behavior among theropods.

Documents/Dire_Wolf.docx (14.8 KB)
Binary file not shown.

Gen_AI.py

Lines changed: 245 additions & 0 deletions
import os
import fitz
import base64
import docx2txt
import streamlit as st
import nltk
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import torch
import time  # Importing time for response tracking

# --- Initial downloads ---
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# --- Session State ---
for key in ['llm', 'vectorstore', 'documents_processed', 'ready_to_ask']:
    if key not in st.session_state:
        st.session_state[key] = None if key == 'llm' else False

st.set_page_config(page_title="💬 AskDocs AI", layout="wide")

# --- File Extraction ---
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    try:
        if ext == '.pdf':
            return "".join([p.get_text() for p in fitz.open(stream=file.read(), filetype="pdf")])
        elif ext == '.docx':
            return docx2txt.process(file)
        elif ext == '.txt':
            return file.getvalue().decode('utf-8')
        else:
            st.error(f"Unsupported file type: {ext}")
    except Exception as e:
        st.warning(f"Error processing {file.name}: {e}")
    return ""

def process_documents(files):
    with ThreadPoolExecutor() as executor:
        texts = list(executor.map(extract_text, files))
    return "\n\n".join([t for t in texts if t])

# --- Embedding ---
@st.cache_resource
def get_embeddings(device):
    return HuggingFaceEmbeddings(
        model_name="D:/Projects/GEN-AI/sentence_transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device}
    )

# --- Load LLM ---
@st.cache_resource
def load_llm():
    model_path = "D:/Projects/GEN-AI/Model/llama-2-7b-chat.Q4_K_M.gguf"
    if not os.path.exists(model_path):
        st.error(f"Model file not found at {model_path}.")
        return None
    return CTransformers(
        model=model_path,
        model_type="llama",
        config={
            "max_new_tokens": 100,  # Reduced token generation limit
            "temperature": 0.7,
            "top_p": 0.95,
            "repetition_penalty": 1.1,
            "threads": 8,  # Multi-threading for faster inference
            "batch_size": 8  # Increase batch size for better throughput
        }
    )

def get_qa_prompt_template():
    return PromptTemplate(
        template="You are an assistant answering questions based on document content.\nContext: {context}\nQuestion: {question}\nAnswer:",
        input_variables=["context", "question"]
    )
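
# With chain_type="stuff" (used in the QA section below), the retrieved chunks are
# concatenated into {context} and the user's query fills {question} before the
# rendered prompt is sent to the LLM.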

# --- Evaluation ---
def calculate_metrics(pred, truth):
    smoother = SmoothingFunction().method1
    bleu = sentence_bleu([truth.split()], pred.split(), smoothing_function=smoother)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge = scorer.score(truth, pred)
    return {
        "BLEU": round(bleu, 4),
        "ROUGE-1": round(rouge['rouge1'].fmeasure, 4),
        "ROUGE-L": round(rouge['rougeL'].fmeasure, 4)
    }
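
# Illustrative call (hypothetical strings, not from a real run; on inputs this short
# BLEU is heavily smoothed and near zero, while both ROUGE F1 scores work out to
# roughly 0.6667 because two of the three unigrams overlap):
#   calculate_metrics("a feathered theropod", "the feathered theropod")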

def log_evaluation(q, a, t, m):
    with open("evaluation_log.json", "a", encoding="utf-8") as f:
        json.dump({"timestamp": str(datetime.now()), "question": q, "answer": a, "truth": t, "metrics": m}, f)
        f.write("\n")

# --- Aggregating Evaluation Stats ---
def get_evaluation_stats():
    try:
        with open("evaluation_log.json", "r", encoding="utf-8") as f:
            logs = [json.loads(line) for line in f.readlines()]

        if not logs:
            return None

        # Aggregate metrics
        bleu_scores = []
        rouge1_scores = []
        rougeL_scores = []

        for log in logs:
            metrics = log['metrics']
            bleu_scores.append(metrics['BLEU'])
            rouge1_scores.append(metrics['ROUGE-1'])
            rougeL_scores.append(metrics['ROUGE-L'])

        return {
            "average_bleu": round(sum(bleu_scores) / len(bleu_scores), 4),
            "average_rouge1": round(sum(rouge1_scores) / len(rouge1_scores), 4),
            "average_rougeL": round(sum(rougeL_scores) / len(rougeL_scores), 4),
            "total_evaluations": len(logs)
        }
    except Exception as e:
        st.warning(f"Error in aggregating evaluation stats: {e}")
        return None

# --- Optimization Suggestions ---
def generate_optimization_suggestions(stats):
    suggestions = []

    # BLEU improvement
    if stats['average_bleu'] < 0.4:
        suggestions.append("Consider fine-tuning the model on a domain-specific corpus to improve response relevance.")
    # ROUGE-1 improvement
    if stats['average_rouge1'] < 0.5:
        suggestions.append("Increase retrieval depth (k=4) to gather more context for better answers.")
    # ROUGE-L improvement
    if stats['average_rougeL'] < 0.5:
        suggestions.append("Reduce chunk size for more granular context (currently 400 tokens) or overlap (80 tokens).")

    return suggestions

# --- App layout ---
st.title("AskDocs AI")
st.subheader("*From documents to decisions — powered by AI, secured locally.*")

# --- Sidebar ---
with st.sidebar:
    st.header("*Upload your Document!*")
    uploaded_files = st.file_uploader("*Upload PDF, DOCX, or TXT files*", type=["pdf", "docx", "txt"], accept_multiple_files=True)

    if uploaded_files and st.button("Start The Fun!"):
        with st.spinner("Processing documents..."):
            all_text = process_documents(uploaded_files)
            if all_text:
                splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=80)
                chunks = splitter.split_text(all_text)
                st.info(f"Processed {len(chunks)} chunks from {len(uploaded_files)} files.")
                device = "cuda" if torch.cuda.is_available() else "cpu"
                embeddings = get_embeddings(device)
                vectordb = FAISS.from_texts(chunks, embedding=embeddings)
                vectordb.save_local("faiss_index")
                st.session_state['vectorstore'] = vectordb
                st.session_state['documents_processed'] = True
                st.success("Documents processed and vector DB created.")
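
# Note: chunk_size=400 and chunk_overlap=80 above are measured in characters, not
# tokens (RecursiveCharacterTextSplitter's default length function is len), so the
# "400 tokens" wording in the optimization suggestions is approximate.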

# --- Load FAISS if exists ---
if not st.session_state['documents_processed'] and os.path.exists("faiss_index"):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = get_embeddings(device)
    vectordb = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    st.session_state['vectorstore'] = vectordb
    st.session_state['documents_processed'] = True
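
# Quick sanity check of the loaded index (illustrative, not part of the app flow):
#   st.session_state['vectorstore'].similarity_search("sample query", k=2)
# returns the two stored chunks whose embeddings are closest to the query.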

# --- QA Section ---
if st.session_state['documents_processed'] and st.session_state['vectorstore']:
    st.subheader("Ask a Question Based on the Documents 📄💡")
    question = st.text_input("Enter your question:")

    if question:
        if st.session_state.llm is None:
            st.session_state.llm = load_llm()

        if st.session_state.llm:
            retriever = st.session_state.vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})
            qa_chain = RetrievalQA.from_chain_type(
                llm=st.session_state.llm,
                retriever=retriever,
                chain_type="stuff",
                chain_type_kwargs={"prompt": get_qa_prompt_template()}
            )

            # Track response time
            start_time = time.time()
            with st.spinner("Thinking... 🤔"):
                result = qa_chain.invoke({"query": question})
                answer = result.get("result", result)
            end_time = time.time()

            response_time = end_time - start_time
            st.session_state['ready_to_ask'] = True

            st.markdown("### 🤖 Answer:")
            st.success(answer)
            st.write(f"⏱ Response Time: {response_time:.2f} seconds")  # Log response time

            if st.checkbox("Enable Evaluation Mode"):
                ground_truth = st.text_area("Enter the Ground Truth Answer")
                if ground_truth and st.button("Evaluate"):
                    metrics = calculate_metrics(answer, ground_truth)
                    log_evaluation(question, answer, ground_truth, metrics)
                    st.subheader("📊 Evaluation Metrics")
                    st.json(metrics)

                    stats = get_evaluation_stats()
                    if stats:
                        st.subheader("📈 Aggregated Stats")
                        st.json(stats)

                        suggestions = generate_optimization_suggestions(stats)
                        st.subheader("🛠 Optimization Suggestions")
                        for tip in suggestions:
                            st.markdown(f"- {tip}")

# --- Footer Info ---
with st.sidebar:
    st.markdown("---")
    st.markdown("""
### How to use:
1. Upload PDF, DOCX, or TXT documents
2. Click 'Start The Fun!'
3. Ask questions about your documents
4. (Optional) Enable evaluation mode for answer quality

### About:
This app uses a locally hosted LLaMA 2 model via CTransformers with FAISS for retrieval.
It evaluates responses using BLEU and ROUGE metrics.
""")

0 commit comments