From bfd2314f7287c91d6552c74942daf6ff402b81eb Mon Sep 17 00:00:00 2001
From: Sri Charan Thoutam <thoutamsricharan@gmail.com>
Date: Mon, 13 Jan 2025 19:35:36 +0530
Subject: [PATCH] rag_chain added to rag_tutorials

---
 rag_tutorials/rag_chain/README.md        |  56 +++++++++
 rag_tutorials/rag_chain/app.py           | 139 +++++++++++++++++++++++
 rag_tutorials/rag_chain/requirements.txt |   9 ++
 3 files changed, 204 insertions(+)
 create mode 100644 rag_tutorials/rag_chain/README.md
 create mode 100644 rag_tutorials/rag_chain/app.py
 create mode 100644 rag_tutorials/rag_chain/requirements.txt
diff --git a/rag_tutorials/rag_chain/README.md b/rag_tutorials/rag_chain/README.md
new file mode 100644
index 0000000..4a0f1ca
--- /dev/null
+++ b/rag_tutorials/rag_chain/README.md
@@ -0,0 +1,56 @@
+# PharmaQuery
+
+## Overview
+PharmaQuery is an advanced Pharmaceutical Insight Retrieval System designed to help users gain meaningful insights from research papers and documents in the pharmaceutical domain.
+
+## PharmaQuery Architecture
+![PharmaQuery-Architecture](https://github.com/user-attachments/assets/c8a2cff7-f004-415c-8b1e-5387999680b4)
+
+## Features
+- **Natural Language Querying**: Ask complex questions about the pharmaceutical industry and get concise, accurate answers.
+- **Custom Database**: Upload your own research documents to enhance the retrieval system's knowledge base.
+- **Similarity Search**: Retrieves the most relevant documents for your query using AI embeddings.
+- **Streamlit Interface**: User-friendly interface for queries and document uploads.
+
+## Technologies Used
+- **Programming Language**: [Python 3.10+](https://www.python.org/downloads/release/python-31011/)
+- **Framework**: [LangChain](https://www.langchain.com/)
+- **Database**: [ChromaDB](https://www.trychroma.com/)
+- **Models**:
+  - Embeddings: [Google Gemini API (embedding-001)](https://ai.google.dev/gemini-api/docs/embeddings)
+  - Chat: [Google Gemini API (gemini-1.5-pro)](https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro)
+- **PDF Processing**: [PyPDFLoader](https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/)
+- **Document Splitter**: [SentenceTransformersTokenTextSplitter](https://python.langchain.com/api_reference/text_splitters/sentence_transformers/langchain_text_splitters.sentence_transformers.SentenceTransformersTokenTextSplitter.html)
+
+## Requirements
+1. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. **Set Up Environment Variables**:
+   Create a `.env` file in the project root directory with the following variable:
+
+    ```bash
+    GOOGLE_API_KEY="your_google_gemini_api_key"
+    ```
+
+    **Note:** Replace `your_google_gemini_api_key` with your actual API key.
+
+3. **Run the Application**:
+   ```bash
+   streamlit run app.py
+   ```
+
+4. **Use the Application**:
+   - Enter your query in the main interface.
+   - Optionally, upload research papers in the sidebar to enhance the database.
+
+## :mailbox: Connect With Me
+<img align="right" src="https://media.giphy.com/media/2HtWpp60NQ9CU/giphy.gif" alt="handshake gif" width="150">
+
+<p align="left">
+  <a href="https://linkedin.com/in/codewithcharan" target="_blank"><img align="center" src="https://raw.githubusercontent.com/rahuldkjain/github-profile-readme-generator/master/src/images/icons/Social/linked-in-alt.svg" alt="codewithcharan" height="30" width="40" style="margin-right: 10px" /></a>
+  <a href="https://instagram.com/joyboy._.ig" target="_blank"><img align="center" src="https://raw.githubusercontent.com/rahuldkjain/github-profile-readme-generator/master/src/images/icons/Social/instagram.svg" alt="joyboy._.ig" height="30" width="40" /></a>
+  <a href="https://twitter.com/Joyboy_x_" target="_blank"><img align="center" src="https://raw.githubusercontent.com/rahuldkjain/github-profile-readme-generator/master/src/images/icons/Social/twitter.svg" alt="Joyboy_x_" height="30" width="40" style="margin-right: 10px" /></a>
+</p>
\ No newline at end of file
diff --git a/rag_tutorials/rag_chain/app.py b/rag_tutorials/rag_chain/app.py
new file mode 100644
index 0000000..1c1c962
--- /dev/null
+++ b/rag_tutorials/rag_chain/app.py
@@ -0,0 +1,139 @@
+import os
+import streamlit as st
+
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_chroma import Chroma
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+
+from dotenv import load_dotenv
+load_dotenv()  # load variables such as GOOGLE_API_KEY from a local .env file
+
+# Embedding model handed to Chroma below as its embedding function
+embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+# Pharma vector store: a Chroma collection persisted on disk under ./pharma_db
+db = Chroma(
+    persist_directory="./pharma_db", collection_name="pharma_database", embedding_function=embedding_model
+)
+
+def format_docs(docs):  # merge the page_content of every document, separated by a blank line
+    return "\n\n".join([retrieved.page_content for retrieved in docs])
+
+def add_to_db(uploaded_files):
+    """Index uploaded PDF files into the Chroma collection `db`.
+
+    Each upload is written to ./temp, parsed page by page with PyPDFLoader,
+    split into token chunks and added to `db`. The temporary copy is removed
+    even when loading, splitting or indexing raises.
+
+    NOTE(review): SentenceTransformersTokenTextSplitter sizes chunks via
+    `tokens_per_chunk`; confirm `chunk_size=100` has the intended effect.
+    """
+    if not uploaded_files:
+        st.error("No files uploaded!")
+        return
+
+    # Build the splitter once instead of once per file (it loads a model).
+    st_text_splitter = SentenceTransformersTokenTextSplitter(
+        model_name="sentence-transformers/all-mpnet-base-v2",
+        chunk_size=100,
+        chunk_overlap=50,
+    )
+    os.makedirs("./temp", exist_ok=True)
+
+    for uploaded_file in uploaded_files:
+        # basename() keeps a crafted upload name from escaping ./temp
+        temp_file_path = os.path.join("./temp", os.path.basename(uploaded_file.name))
+        try:
+            with open(temp_file_path, "wb") as temp_file:
+                temp_file.write(uploaded_file.getbuffer())
+            data = PyPDFLoader(temp_file_path).load()
+            doc_content = [page.page_content for page in data]
+            doc_metadata = [page.metadata for page in data]
+            db.add_documents(st_text_splitter.create_documents(doc_content, doc_metadata))
+        finally:
+            if os.path.exists(temp_file_path):
+                os.remove(temp_file_path)
+
+def run_rag_chain(query):
+    """Answer `query` using chunks retrieved from the `db` collection.
+
+    Retrieves the 5 most similar chunks, fills them into the prompt as
+    context, and returns the Gemini chat model's reply as a plain string.
+    """
+
+    # Prompt: restricts the model to the retrieved context
+    PROMPT_TEMPLATE = """
+    You are a highly knowledgeable assistant specializing in pharmaceutical sciences. 
+    Answer the question based only on the following context:
+    {context}
+
+    Answer the question based on the above context:
+    {question}
+
+    Use the provided context to answer the user's question accurately and concisely.
+    Don't justify your answers.
+    Don't give information not mentioned in the CONTEXT INFORMATION.
+    Do not say "according to the context" or "mentioned in the context" or similar.
+    """
+    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+
+    # Retrieval step: similarity search, joined into one context string
+    top_k_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+    chain_inputs = {
+        "context": top_k_retriever | format_docs,
+        "question": RunnablePassthrough(),
+    }
+
+    # Generation step: Gemini chat model followed by plain-string parsing
+    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=1)
+    to_text = StrOutputParser()
+
+    # Compose the pieces with pipes and run the chain on the raw query string
+    rag_chain = chain_inputs | prompt | llm | to_text
+    return rag_chain.invoke(query)
+
+def main():
+    """Render the PharmaQuery page: query form in the body, PDF upload in the sidebar."""
+    st.set_page_config(page_title="PharmaQuery", page_icon=":microscope:")
+    st.header("Pharmaceutical Insight Retrieval System")
+
+    # --- Main area: free-text question answered by the RAG chain ---
+    query = st.text_area(
+        ":bulb: Enter your query about the Pharmaceutical Industry:",
+        placeholder="e.g., What are the AI applications in drug discovery?",
+    )
+    ask_clicked = st.button("Submit")
+    if ask_clicked and query:
+        with st.spinner("Thinking..."):
+            result = run_rag_chain(query=query)
+            st.write(result)
+    elif ask_clicked:
+        st.warning("Please ask a question")
+
+    # --- Sidebar: optional PDF upload that is passed to add_to_db ---
+    with st.sidebar:
+        st.title("Upload your research documents (Optional) :memo:")
+        pdf_docs = st.file_uploader(
+            "Enhance your query by uploading PDF files related to Pharmaceutical Sciences.",
+            type=["pdf"],
+            accept_multiple_files=True,
+        )
+        upload_clicked = st.button("Submit & Process")
+        if upload_clicked and pdf_docs:
+            with st.spinner("Processing your documents..."):
+                add_to_db(pdf_docs)
+                st.success(":file_folder: Documents successfully added to the database!")
+        elif upload_clicked:
+            st.warning("Please upload the file")
+
+    # Sidebar Footer
+    st.sidebar.write("Built with ❤️ by [Charan](https://www.linkedin.com/in/codewithcharan/)")
+             
+if __name__ == "__main__":  # build the page only when this file is run as a script
+    main()
\ No newline at end of file
diff --git a/rag_tutorials/rag_chain/requirements.txt b/rag_tutorials/rag_chain/requirements.txt
new file mode 100644
index 0000000..7b5fa46
--- /dev/null
+++ b/rag_tutorials/rag_chain/requirements.txt
@@ -0,0 +1,9 @@
+streamlit
+langchain-google-genai
+langchain-chroma
+langchain-community
+langchain-core
+chromadb
+sentence-transformers
+pypdf
+python-dotenv