From bfd2314f7287c91d6552c74942daf6ff402b81eb Mon Sep 17 00:00:00 2001
From: Sri Charan Thoutam
Date: Mon, 13 Jan 2025 19:35:36 +0530
Subject: [PATCH] rag_chain added to rag_tutorials

---
 rag_tutorials/rag_chain/README.md        |  56 +++++++++
 rag_tutorials/rag_chain/app.py           | 139 +++++++++++++++++++++++
 rag_tutorials/rag_chain/requirements.txt |   9 ++
 3 files changed, 204 insertions(+)
 create mode 100644 rag_tutorials/rag_chain/README.md
 create mode 100644 rag_tutorials/rag_chain/app.py
 create mode 100644 rag_tutorials/rag_chain/requirements.txt

diff --git a/rag_tutorials/rag_chain/README.md b/rag_tutorials/rag_chain/README.md
new file mode 100644
index 0000000..4a0f1ca
--- /dev/null
+++ b/rag_tutorials/rag_chain/README.md
@@ -0,0 +1,56 @@
+# PharmaQuery
+
+## Overview
+PharmaQuery is an advanced Pharmaceutical Insight Retrieval System designed to help users gain meaningful insights from research papers and documents in the pharmaceutical domain.
+
+## PharmaQuery Architecture
+![PharmaQuery-Architecture](https://github.com/user-attachments/assets/c8a2cff7-f004-415c-8b1e-5387999680b4)
+
+## Features
+- **Natural Language Querying**: Ask complex questions about the pharmaceutical industry and get concise, accurate answers.
+- **Custom Database**: Upload your own research documents to enhance the retrieval system's knowledge base.
+- **Similarity Search**: Retrieves the most relevant documents for your query using AI embeddings.
+- **Streamlit Interface**: User-friendly interface for queries and document uploads.
+
+## Technologies Used
+- **Programming Language**: [Python 3.10+](https://www.python.org/downloads/release/python-31011/)
+- **Framework**: [LangChain](https://www.langchain.com/)
+- **Database**: [ChromaDB](https://www.trychroma.com/)
+- **Models**:
+  - Embeddings: [Google Gemini API (embedding-001)](https://ai.google.dev/gemini-api/docs/embeddings)
+  - Chat: [Google Gemini API (gemini-1.5-pro)](https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro)
+- **PDF Processing**: [PyPDFLoader](https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/)
+- **Document Splitter**: [SentenceTransformersTokenTextSplitter](https://python.langchain.com/api_reference/text_splitters/sentence_transformers/langchain_text_splitters.sentence_transformers.SentenceTransformersTokenTextSplitter.html)
+
+## Requirements
+1. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. **Set Up Environment Variables**:
+   Create a `.env` file in the project root directory with the following variables:
+
+   ```bash
+   GOOGLE_API_KEY="your_google_gemini_api_key"
+   ```
+
+   **Note:** Replace `your_google_gemini_api_key` with your actual API key.
+
+3. **Run the Application**:
+   ```bash
+   streamlit run app.py
+   ```
+
+4. **Use the Application**:
+   - Enter your query in the main interface.
+   - Optionally, upload research papers in the sidebar to enhance the database.
+
+## :mailbox: Connect With Me
+handshake gif
+
+

+ codewithcharan + __mr.__.unique + codewithcharan +

\ No newline at end of file
diff --git a/rag_tutorials/rag_chain/app.py b/rag_tutorials/rag_chain/app.py
new file mode 100644
index 0000000..1c1c962
--- /dev/null
+++ b/rag_tutorials/rag_chain/app.py
@@ -0,0 +1,139 @@
+import os
+import streamlit as st
+
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_chroma import Chroma
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+
+from dotenv import load_dotenv
+load_dotenv()
+
+# Initialize embedding model
+embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+# Initialize pharma database
+db = Chroma(collection_name="pharma_database",
+            embedding_function=embedding_model,
+            persist_directory='./pharma_db')
+
+def format_docs(docs):
+    """Join the page contents of *docs* into one string, separated by blank lines."""
+    return "\n\n".join(doc.page_content for doc in docs)
+
+def add_to_db(uploaded_files):
+    """Load each uploaded PDF, split it into token chunks and add them to ``db``."""
+    # Check if files are uploaded
+    if not uploaded_files:
+        st.error("No files uploaded!")
+        return
+
+    # Build the splitter once; it does not depend on the file being processed
+    st_text_splitter = SentenceTransformersTokenTextSplitter(
+        model_name="sentence-transformers/all-mpnet-base-v2",
+        chunk_size=100,
+        chunk_overlap=50
+    )
+    os.makedirs("./temp", exist_ok=True)
+
+    for uploaded_file in uploaded_files:
+        # Use only the base name so a crafted file name cannot escape ./temp
+        temp_file_path = os.path.join("./temp", os.path.basename(uploaded_file.name))
+        try:
+            # Save the uploaded file to a temporary path (PyPDFLoader reads from disk)
+            with open(temp_file_path, "wb") as temp_file:
+                temp_file.write(uploaded_file.getbuffer())
+
+            # Load the file using PyPDFLoader
+            data = PyPDFLoader(temp_file_path).load()
+
+            # Split the page contents into smaller chunks, keeping each page's metadata
+            st_chunks = st_text_splitter.create_documents(
+                [page.page_content for page in data],
+                [page.metadata for page in data]
+            )
+
+            # Add chunks to database (nothing to add if the PDF produced no chunks)
+            if st_chunks:
+                db.add_documents(st_chunks)
+        finally:
+            # Remove the temporary file even if loading or indexing failed
+            if os.path.exists(temp_file_path):
+                os.remove(temp_file_path)
+
+def run_rag_chain(query):
+    """Answer *query* from the 5 most similar chunks in ``db`` and return the reply text."""
+    # Create a Retriever Object and apply Similarity Search
+    retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 5})
+
+    # Initialize a Chat Prompt Template
+    PROMPT_TEMPLATE = """
+    You are a highly knowledgeable assistant specializing in pharmaceutical sciences.
+    Answer the question based only on the following context:
+    {context}
+
+    Answer the question based on the above context:
+    {question}
+
+    Use the provided context to answer the user's question accurately and concisely.
+    Don't justify your answers.
+    Don't give information not mentioned in the CONTEXT INFORMATION.
+    Do not say "according to the context" or "mentioned in the context" or similar.
+    """
+
+    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+
+    # Initialize a Generator (i.e. Chat Model)
+    chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=1)
+
+    # RAG Chain: retrieve context, fill the prompt, generate, parse the reply to a string
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt_template | chat_model | StrOutputParser()
+    )
+
+    return rag_chain.invoke(query)
+
+def main():
+    """Render the Streamlit page: query box in the main area, PDF upload in the sidebar."""
+    st.set_page_config(page_title="PharmaQuery", page_icon=":microscope:")
+    st.header("Pharmaceutical Insight Retrieval System")
+
+    query = st.text_area(
+        ":bulb: Enter your query about the Pharmaceutical Industry:",
+        placeholder="e.g., What are the AI applications in drug discovery?"
+    )
+
+    if st.button("Submit"):
+        if not query.strip():  # whitespace-only input counts as empty
+            st.warning("Please ask a question")
+        else:
+            with st.spinner("Thinking..."):
+                result = run_rag_chain(query=query)
+                st.write(result)
+
+    with st.sidebar:
+        st.title("Upload your research documents (Optional) :memo:")
+        pdf_docs = st.file_uploader(
+            "Enhance your query by uploading PDF files related to Pharmaceutical Sciences.",
+            type=["pdf"],
+            accept_multiple_files=True
+        )
+
+        if st.button("Submit & Process"):
+            if not pdf_docs:
+                st.warning("Please upload the file")
+            else:
+                with st.spinner("Processing your documents..."):
+                    add_to_db(pdf_docs)
+                    st.success(":file_folder: Documents successfully added to the database!")
+
+    # Sidebar Footer
+    st.sidebar.write("Built with ❤️ by [Charan](https://www.linkedin.com/in/codewithcharan/)")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/rag_tutorials/rag_chain/requirements.txt b/rag_tutorials/rag_chain/requirements.txt
new file mode 100644
index 0000000..7b5fa46
--- /dev/null
+++ b/rag_tutorials/rag_chain/requirements.txt
@@ -0,0 +1,9 @@
+streamlit
+langchain-google-genai
+langchain-chroma
+langchain-community
+langchain-core
+chromadb
+sentence-transformers
+pypdf
+python-dotenv