From 783f88b18cdae399c2a7016ffd23938929412572 Mon Sep 17 00:00:00 2001 From: "abir.chebbi" <abir.chebbi@hes-so.ch> Date: Tue, 10 Sep 2024 13:54:22 +0200 Subject: [PATCH] add part 2 --- ...reate-Vector-DB.py => create-vector-dB.py} | 2 +- Part 1/{Delete-s3.py => delete-s3.py} | 0 Part 1/main.py | 14 +- Part 1/test.py | 1 - .../create_instance.py | 0 Part 2/main.py | 126 ++++++++++++++++++ Part 2/requirements.txt | 1 + Part 2/run_app.py | 0 README.md | 8 +- 9 files changed, 141 insertions(+), 11 deletions(-) rename Part 1/{Create-Vector-DB.py => create-vector-dB.py} (99%) rename Part 1/{Delete-s3.py => delete-s3.py} (100%) rename Part 1/Delete-Vector-DB.py => Part 2/create_instance.py (100%) create mode 100644 Part 2/main.py create mode 100644 Part 2/requirements.txt create mode 100644 Part 2/run_app.py diff --git a/Part 1/Create-Vector-DB.py b/Part 1/create-vector-dB.py similarity index 99% rename from Part 1/Create-Vector-DB.py rename to Part 1/create-vector-dB.py index 4e63c5e..99ddcb3 100644 --- a/Part 1/Create-Vector-DB.py +++ b/Part 1/create-vector-dB.py @@ -6,7 +6,7 @@ import time client = boto3.client('opensearchserverless') service = 'aoss' -Vector_store_name='test1' +Vector_store_name='test2' def createEncryptionPolicy(client): """Creates an encryption policy that matches all collections beginning with test""" diff --git a/Part 1/Delete-s3.py b/Part 1/delete-s3.py similarity index 100% rename from Part 1/Delete-s3.py rename to Part 1/delete-s3.py diff --git a/Part 1/main.py b/Part 1/main.py index e42c3cc..36feb7e 100644 --- a/Part 1/main.py +++ b/Part 1/main.py @@ -12,7 +12,7 @@ import json ## Local directory for storing PDF files LOCAL_DIR = "pdfs" -index_name = "cloud_lecture_test3" +index_name = "cloud_lecture" ## S3_client @@ -29,7 +29,7 @@ credentials = boto3.Session().get_credentials() awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss') ## Vector DB endpoint -host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com' +host= 'j6phg34iv0f2rlvxwawd.us-east-1.aoss.amazonaws.com' ## Opensearch Client OpenSearch_client = OpenSearch( @@ -142,16 +142,16 @@ def main(): download_documents(BUCKET_NAME,LOCAL_DIR) loader= PyPDFDirectoryLoader(LOCAL_DIR) docs = loader.load() - print(docs[80]) + print(docs[1]) chunks = split_text(docs, 1000, 100) - print(chunks[80]) + print(chunks[1]) embeddings= generate_embeddings(bedrock_client, chunks) - print(embeddings[80]) + print(embeddings[1]) texts = [chunk.page_content for chunk in chunks] # Prepare metadata for each chunk meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks] - print(embeddings[80]) - print(meta_data[80]) + print(embeddings[1]) + print(meta_data[1]) store_embeddings(embeddings, texts, meta_data ,host, awsauth,index_name) diff --git a/Part 1/test.py b/Part 1/test.py index 3f5cd21..2b25ba2 100644 --- a/Part 1/test.py +++ b/Part 1/test.py @@ -14,7 +14,6 @@ opensearch_client = OpenSearch( use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection, - timeout=300 ) # Embeddings Client diff --git a/Part 1/Delete-Vector-DB.py b/Part 2/create_instance.py similarity index 100% rename from Part 1/Delete-Vector-DB.py rename to Part 2/create_instance.py diff --git a/Part 2/main.py b/Part 2/main.py new file mode 100644 index 0000000..713a2bd --- /dev/null +++ b/Part 2/main.py @@ -0,0 +1,126 @@ +import boto3 +import streamlit as st +## Bedrock +from langchain.llms.bedrock import Bedrock +## prompt and chain +from langchain.chains import RetrievalQA +from langchain_community.embeddings import BedrockEmbeddings +from langchain_community.chat_models import BedrockChat +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth + +from langchain import PromptTemplate +# Embeddings Client +bedrock_client = boto3.client(service_name="bedrock-runtime") + +# configuring streamlit page settings +st.set_page_config( + page_title="cloud lecture lab", + page_icon="💬", + layout="centered" +) + + +# streamlit page title +st.title("Chat with your lecture") + + +# AWS and OpenSearch Configuration +host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com' +index_name = 'cloud_lecture' +awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss') + +# OpenSearch Client +opensearch_client = OpenSearch( + hosts=[{'host': host, 'port': 443}], + http_auth=awsauth, + use_ssl=True, + verify_certs=True, + connection_class=RequestsHttpConnection, +) + +def get_embedding(question, bedrock_client): + embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client) + embedding = embeddings_model.embed_query(question) + return embedding + +def similarity_search(embed_query, index_name): + query_body = { + "size": 5, + "query": { + "knn": { + "vector_field": { + "vector": embed_query, + "k": 5 + } + } + } + } + response = opensearch_client.search(index=index_name, body=query_body) + return response['hits']['hits'] + +def prepare_prompt(question, context): + template = """ + You are a Professor. The student will ask you a questions about the lecture. + Use following piece of context to answer the question. + If you don't know the answer, just say you don't know. + + Context: <context> + {context} + </context> + Question: {question} + Answer: + + """ + + prompt = PromptTemplate( + template=template, + input_variables=['context', 'question'] + ) + prompt_formatted_str = prompt.format(context=context, question= question) + return prompt_formatted_str + +def generate_answer(prompt): + model = BedrockChat(model_id="anthropic.claude-v2", model_kwargs={"temperature": 0.1}) + answer = model.invoke(prompt) + return answer + + +def main(): + + # initialize chat session in streamlit if not already present + if "chat_history" not in st.session_state: + st.session_state.chat_history = [] + + + # display chat history + for message in st.session_state.chat_history: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + + # input field for user's message + user_prompt = st.chat_input("Ask a question for your knowledge base") + + if user_prompt: + # add user's message to chat and display it + st.chat_message("user").markdown(user_prompt) + st.session_state.chat_history.append({"role": "user", "content": user_prompt}) + # Generate and display answer + print(user_prompt) + embed_question= get_embedding(user_prompt,bedrock_client) + print(embed_question) + sim_results = similarity_search(embed_question, index_name) + context = [i['_source']['text'] for i in sim_results] + print(context) + prompt = prepare_prompt(user_prompt, context) + print(prompt) + answer = generate_answer(prompt) + st.session_state.chat_history.append({"role": "system", "content": answer}) + for message in st.session_state.chat_history[-1:]: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +if __name__== "__main__": + main() + + diff --git a/Part 2/requirements.txt b/Part 2/requirements.txt new file mode 100644 index 0000000..12a4706 --- /dev/null +++ b/Part 2/requirements.txt @@ -0,0 +1 @@ +streamlit diff --git a/Part 2/run_app.py b/Part 2/run_app.py new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index c15c86a..b39d74b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ 1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1) 2. Ensure python is installed: python 3.8 or higher 2. Install required python libraries listed in the 'requirements.txt': -`pip install -r requirement.txt` +`pip3 install -r requirements.txt` ## Part 1: @@ -31,4 +31,8 @@ The main.py script will: 1. Download PDF files from the S3 bucket. 2. Split them into chunks. 3. Generate embeddings from the chunks. -4. Store these embeddings in the OpenSearch Vector DB. \ No newline at end of file +4. Store these embeddings in the OpenSearch Vector DB. + + +## Part 2: + -- GitLab