From 783f88b18cdae399c2a7016ffd23938929412572 Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Tue, 10 Sep 2024 13:54:22 +0200
Subject: [PATCH] add part 2

---
 ...reate-Vector-DB.py => create-vector-dB.py} |   2 +-
 Part 1/{Delete-s3.py => delete-s3.py}         |   0
 Part 1/main.py                                |  14 +-
 Part 1/test.py                                |   1 -
 .../create_instance.py                        |   0
 Part 2/main.py                                | 126 ++++++++++++++++++
 Part 2/requirements.txt                       |   1 +
 Part 2/run_app.py                             |   0
 README.md                                     |   8 +-
 9 files changed, 141 insertions(+), 11 deletions(-)
 rename Part 1/{Create-Vector-DB.py => create-vector-dB.py} (99%)
 rename Part 1/{Delete-s3.py => delete-s3.py} (100%)
 rename Part 1/Delete-Vector-DB.py => Part 2/create_instance.py (100%)
 create mode 100644 Part 2/main.py
 create mode 100644 Part 2/requirements.txt
 create mode 100644 Part 2/run_app.py

diff --git a/Part 1/Create-Vector-DB.py b/Part 1/create-vector-dB.py
similarity index 99%
rename from Part 1/Create-Vector-DB.py
rename to Part 1/create-vector-dB.py
index 4e63c5e..99ddcb3 100644
--- a/Part 1/Create-Vector-DB.py	
+++ b/Part 1/create-vector-dB.py	
@@ -6,7 +6,7 @@ import time
 
 client = boto3.client('opensearchserverless')
 service = 'aoss'
-Vector_store_name='test1'
+Vector_store_name='test2'
 
 def createEncryptionPolicy(client):
     """Creates an encryption policy that matches all collections beginning with test"""
diff --git a/Part 1/Delete-s3.py b/Part 1/delete-s3.py
similarity index 100%
rename from Part 1/Delete-s3.py
rename to Part 1/delete-s3.py
diff --git a/Part 1/main.py b/Part 1/main.py
index e42c3cc..36feb7e 100644
--- a/Part 1/main.py	
+++ b/Part 1/main.py	
@@ -12,7 +12,7 @@ import json
 
 ##  Local directory for storing PDF files
 LOCAL_DIR = "pdfs" 
-index_name = "cloud_lecture_test3"
+index_name = "cloud_lecture"
 
 
 ## S3_client
@@ -29,7 +29,7 @@ credentials = boto3.Session().get_credentials()
 awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
 
 ## Vector DB endpoint
-host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
+host= 'j6phg34iv0f2rlvxwawd.us-east-1.aoss.amazonaws.com'
 
 ## Opensearch Client
 OpenSearch_client = OpenSearch(
@@ -142,16 +142,16 @@ def main():
     download_documents(BUCKET_NAME,LOCAL_DIR)
     loader= PyPDFDirectoryLoader(LOCAL_DIR)
     docs = loader.load()
-    print(docs[80])
+    print(docs[1])
     chunks = split_text(docs, 1000, 100)
-    print(chunks[80])
+    print(chunks[1])
     embeddings= generate_embeddings(bedrock_client, chunks)
-    print(embeddings[80])
+    print(embeddings[1])
     texts = [chunk.page_content for chunk in chunks]
      # Prepare metadata for each chunk
     meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
-    print(embeddings[80])
-    print(meta_data[80])
+    print(embeddings[1])
+    print(meta_data[1])
     store_embeddings(embeddings, texts, meta_data ,host, awsauth,index_name)
 
 
diff --git a/Part 1/test.py b/Part 1/test.py
index 3f5cd21..2b25ba2 100644
--- a/Part 1/test.py	
+++ b/Part 1/test.py	
@@ -14,7 +14,6 @@ opensearch_client = OpenSearch(
     use_ssl=True,
     verify_certs=True,
     connection_class=RequestsHttpConnection,
-    timeout=300
 )
 
 # Embeddings Client
diff --git a/Part 1/Delete-Vector-DB.py b/Part 2/create_instance.py
similarity index 100%
rename from Part 1/Delete-Vector-DB.py
rename to Part 2/create_instance.py
diff --git a/Part 2/main.py b/Part 2/main.py
new file mode 100644
index 0000000..713a2bd
--- /dev/null
+++ b/Part 2/main.py	
@@ -0,0 +1,126 @@
+import boto3
+import streamlit as st
+## Bedrock
+from langchain.llms.bedrock import Bedrock
+## prompt and chain
+from langchain.chains import RetrievalQA
+from langchain_community.embeddings import BedrockEmbeddings
+from langchain_community.chat_models import BedrockChat
+from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
+
+from langchain import PromptTemplate
+# Embeddings Client
+bedrock_client = boto3.client(service_name="bedrock-runtime")
+
+# configuring streamlit page settings
+st.set_page_config(
+    page_title="cloud lecture lab",
+    page_icon="💬",
+    layout="centered"
+)
+
+
+# streamlit page title
+st.title("Chat with your lecture")
+
+
+# AWS and OpenSearch Configuration
+host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'  
+index_name = 'cloud_lecture'
+awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+
+# OpenSearch Client
+opensearch_client = OpenSearch(
+    hosts=[{'host': host, 'port': 443}],
+    http_auth=awsauth,
+    use_ssl=True,
+    verify_certs=True,
+    connection_class=RequestsHttpConnection,
+)
+
+def get_embedding(question, bedrock_client):
+    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
+    embedding = embeddings_model.embed_query(question)
+    return embedding
+
+def similarity_search(embed_query, index_name):
+    query_body = {
+        "size": 5,
+        "query": {
+            "knn": {
+                "vector_field": {
+                    "vector": embed_query,
+                    "k": 5
+                }
+            }
+        }
+    }
+    response = opensearch_client.search(index=index_name, body=query_body)
+    return response['hits']['hits']
+
+def prepare_prompt(question, context):
+    template = """
+    You are a Professor. The student will ask you questions about the lecture. 
+    Use the following piece of context to answer the question. 
+    If you don't know the answer, just say you don't know. 
+
+    Context:   <context>
+    {context}
+    </context>
+    Question: {question}
+    Answer: 
+
+    """
+
+    prompt = PromptTemplate(
+    template=template, 
+    input_variables=['context', 'question']
+    )
+    prompt_formatted_str = prompt.format(context=context, question= question)
+    return prompt_formatted_str
+
+def generate_answer(prompt):
+    model = BedrockChat(model_id="anthropic.claude-v2", model_kwargs={"temperature": 0.1})
+    answer = model.invoke(prompt)
+    return answer
+
+
+def main():
+
+    # initialize chat session in streamlit if not already present
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+
+        
+    # display chat history
+    for message in st.session_state.chat_history:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+
+    # input field for user's message
+    user_prompt = st.chat_input("Ask a question for your knowledge base")
+
+    if user_prompt:
+    # add user's message to chat and display it
+        st.chat_message("user").markdown(user_prompt)
+        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
+        # Generate and display answer
+        print(user_prompt)
+        embed_question= get_embedding(user_prompt,bedrock_client)
+        print(embed_question)
+        sim_results = similarity_search(embed_question, index_name)
+        context = [i['_source']['text'] for i in sim_results]
+        print(context)
+        prompt = prepare_prompt(user_prompt, context)
+        print(prompt)
+        answer = generate_answer(prompt)
+        st.session_state.chat_history.append({"role": "system", "content": answer})
+        for message in st.session_state.chat_history[-1:]:
+            with st.chat_message(message["role"]):
+                st.markdown(message["content"])
+
+if __name__== "__main__":
+    main()
+
+ 
diff --git a/Part 2/requirements.txt b/Part 2/requirements.txt
new file mode 100644
index 0000000..12a4706
--- /dev/null
+++ b/Part 2/requirements.txt	
@@ -0,0 +1 @@
+streamlit
diff --git a/Part 2/run_app.py b/Part 2/run_app.py
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/README.md
index c15c86a..b39d74b 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
 2. Ensure python is installed: python 3.8 or higher
 2. Install required python libraries listed in the 'requirements.txt': 
-`pip install -r requirement.txt`
+`pip3 install -r requirements.txt`
 
 
 ## Part 1: 
@@ -31,4 +31,8 @@ The main.py script will:
 1. Download PDF files from the S3 bucket.
 2. Split them into chunks.
 3. Generate embeddings from the chunks.
-4. Store these embeddings in the OpenSearch Vector DB.
\ No newline at end of file
+4. Store these embeddings in the OpenSearch Vector DB.
+
+
+## Part 2:
+
-- 
GitLab