From 64a49f187fe7c5cbd0cdb5c9fc1d7f39fe822b33 Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Thu, 12 Sep 2024 14:31:34 +0200
Subject: [PATCH] Adjust the readme

---
 Part 1/create-S3-and-put-docs.py |  8 +++---
 Part 1/create-vector-db.py       |  4 +--
 Part 1/main.py                   |  6 ++---
 Part 2/main.py                   | 34 ++++++++++++++++----------
 README.md                        | 42 ++++++++++++++++++++++----------
 5 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/Part 1/create-S3-and-put-docs.py b/Part 1/create-S3-and-put-docs.py
index 19b310f..2728452 100644
--- a/Part 1/create-S3-and-put-docs.py
+++ b/Part 1/create-S3-and-put-docs.py
@@ -24,7 +24,7 @@ def write_files(s3_client, directory, bucket):
                 Key=filename
             )
             print(f"{filename} uploaded successfully.")
-
+
 def main(bucket_name, local_dir):
     s3_client = boto3.client('s3')
     create_bucket(s3_client, bucket_name)
@@ -32,8 +32,8 @@ def main(bucket_name, local_dir):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
-    parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
-    parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
+    parser.add_argument("--bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
+    parser.add_argument("--local_path", help="The local folder containing the PDF files to upload")
     args = parser.parse_args()
-    main(args.bucket_name, args.LOCAL_DIR)
+    main(args.bucket_name, args.local_path)

diff --git a/Part 1/create-vector-db.py b/Part 1/create-vector-db.py
index 68aedb8..ca98882 100644
--- a/Part 1/create-vector-db.py
+++ b/Part 1/create-vector-db.py
@@ -152,7 +152,7 @@ def main(collection_name,IAM_USER):
 
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Create collection")
-    parser.add_argument("collection_name", help="The name of the collection")
-    parser.add_argument("iam_user", help="The iam user")
+    parser.add_argument("--collection_name", help="The name of the collection")
+    parser.add_argument("--iam_user", help="The IAM user")
     args = parser.parse_args()
     main(args.collection_name,args.iam_user)

diff --git a/Part 1/main.py b/Part 1/main.py
index ab8850d..3a6c717 100644
--- a/Part 1/main.py
+++ b/Part 1/main.py
@@ -165,8 +165,8 @@ def main(bucket_name, endpoint,index_name):
 
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
-    parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
-    parser.add_argument("endpoint", help="The OpenSearch service endpoint")
-    parser.add_argument("index_name", help="The name of the OpenSearch index")
+    parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
+    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
+    parser.add_argument("--index_name", help="The name of the OpenSearch index")
     args = parser.parse_args()
     main(args.bucket_name, args.endpoint, args.index_name)

diff --git a/Part 2/main.py b/Part 2/main.py
index 0a9d201..cf8836e 100644
--- a/Part 2/main.py
+++ b/Part 2/main.py
@@ -7,8 +7,9 @@
 from langchain.chains import RetrievalQA
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.chat_models import BedrockChat
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
-
 from langchain import PromptTemplate
+import argparse
+
 # Embeddings Client
 bedrock_client = boto3.client(service_name="bedrock-runtime")
@@ -24,26 +25,26 @@ st.set_page_config(
 
 st.title("Chat with your lecture")
 
-# AWS and OpenSearch Configuration
-host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
-index_name = 'cloud_lecture'
-awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+
 # OpenSearch Client
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': 443}],
+def opensearch_client(endpoint):
+    awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+    client = OpenSearch(
+        hosts=[{'host': endpoint, 'port': 443}],
     http_auth=awsauth,
     use_ssl=True,
     verify_certs=True,
     connection_class=RequestsHttpConnection,
-)
+    )
+    return client
 
 def get_embedding(question, bedrock_client):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     embedding = embeddings_model.embed_query(question)
     return embedding
 
-def similarity_search(embed_query, index_name):
+def similarity_search(client, embed_query, index_name):
     query_body = {
         "size": 5,
         "query": {
@@ -55,7 +56,7 @@
             }
         }
     }
-    response = opensearch_client.search(index=index_name, body=query_body)
+    response = client.search(index=index_name, body=query_body)
     return response['hits']['hits']
 
 def prepare_prompt(question, context):
@@ -85,7 +86,9 @@
 
     return answer
 
-def main():
+def main(endpoint, index_name):
+
+    oss_client = opensearch_client(endpoint)
 
     # initialize chat session in streamlit if not already present
     if "chat_history" not in st.session_state:
@@ -110,7 +113,7 @@
         embed_question= get_embedding(user_prompt,bedrock_client)
         print(embed_question)
 
-        sim_results = similarity_search(embed_question, index_name)
+        sim_results = similarity_search(oss_client, embed_question, index_name)
         context = [i['_source']['text'] for i in sim_results]
         print(context)
         prompt = prepare_prompt(user_prompt, context)
@@ -122,6 +125,11 @@
             st.markdown(message["content"])
 
 if __name__== "__main__":
-    main()
+    # Argument parsing
+    parser = argparse.ArgumentParser(description='Configure endpoint and index name for the lecture chat application.')
+    parser.add_argument('endpoint', type=str, help='The endpoint for the OpenSearch service.')
+    parser.add_argument('index_name', type=str, help='The index name for storing embeddings.')
+    args = parser.parse_args()
+    main(args.endpoint, args.index_name)

diff --git a/README.md b/README.md
index a6abdd2..1a1c3bf 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,53 @@
 # chatbot-lab
 
 ## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
-2. Ensure python is installed: python 3.8 or higher
-2. Install required python libraries listed in the 'requirements.txt':
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to the setup guide provided in Session 1).
+2. Ensure Python is installed: Python 3.8 or higher.
+3. Install the required Python libraries listed in 'requirements.txt':
 `pip3 install -r requirements.txt`
 
 ## Part 1:
 
 ### Step 1: Object storage Creation
 
-Create an S3 bucket and upload a few PDF files (Detailed steps are provided in the first session).
+Create an S3 bucket and upload a few PDF files by running:
+`python create-S3-and-put-docs.py --bucket_name [YourBucketName] --local_path [PathToYourPDFFiles]`
+Where:
+`--bucket_name`: The name for the new S3 bucket to be created.
+`--local_path`: The local directory path where the PDF files are stored.
+
 ### Step 2: Vector Store Creation
 
-To set up the Vector Store, run the following command: `python create-vector-db.py`
+Create a vector database for storing embeddings by running:
+`python create-vector-db.py --collection_name [YourCollectionName] --iam_user [YourIAMUser]`
+Where:
+`--collection_name`: The name of the collection you want to create to store the embeddings.
+`--iam_user`: Your IAM user; for example, for group 14 the IAM user is master-group-14.
+
 This script performs the following actions:
-* Set up the security policies: Sets up encryption, network, and data access policies for collections starting with "test".
-* Vector Store Initialization: Creates a vector store named test1, specifically designed for vector search operations.
-* Endpoint Retrieval: After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
+* Sets up encryption, network, and data access policies for the collection.
+* Creates a vector store with the collection name passed as an argument.
+* Once the vector store is set up, retrieves and displays the store's endpoint for immediate use.
 
 ### Step 3: Vectorizing the PDF Files
 
-After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
-* In main.py, update the S3 bucket name to the one you created.
-* Update the Vector Store endpoint with the one provided by the setup script.
-* Execute the processing script: `python main.py`
+After setting up the S3 bucket and the Vector Store, process the PDF files to generate embeddings and store them in the vector database.
+
+Run:
+`python main.py --bucket_name [YourBucketName] --endpoint [YourVectorDBEndpoint] --index_name [YourIndexName]`
+
+Where:
+`--bucket_name`: The name of the S3 bucket containing the PDF files.
+`--endpoint`: The endpoint of the vector database.
+`--index_name`: The name of the index in the collection where the embeddings will be stored.
 
 The main.py script will:
 1. Download PDF files from the S3 bucket.
 2. Split them into chunks.
 3. Generate embeddings from the chunks.
-4. Store these embeddings in the OpenSearch Vector DB.
+4. Create an index in the vector DB.
+5. Store these embeddings in the OpenSearch Vector DB.
 
 ## Part 2:

--
GitLab
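
Note on Step 2: the patch documents create-vector-db.py but its body is not shown here. For orientation, below is a minimal sketch of what "encryption, network, and data access policies plus a vector collection" typically looks like with boto3's `opensearchserverless` client. The policy names, policy bodies, region, and the `create_collection` helper are illustrative assumptions, not the repository's actual implementation.

```python
# Illustrative sketch only: the shape of OpenSearch Serverless setup that
# Step 2 describes; create-vector-db.py may differ in names and details.
import json
import boto3

aoss = boto3.client("opensearchserverless", region_name="us-east-1")  # region assumed

def create_vector_collection(collection_name: str, iam_user_arn: str) -> str:
    # Encryption policy: AWS-owned key scoped to this collection.
    aoss.create_security_policy(
        name=f"{collection_name}-enc",  # hypothetical name
        type="encryption",
        policy=json.dumps({
            "Rules": [{"ResourceType": "collection",
                       "Resource": [f"collection/{collection_name}"]}],
            "AWSOwnedKey": True,
        }),
    )
    # Network policy: expose the collection endpoint publicly (lab setting).
    aoss.create_security_policy(
        name=f"{collection_name}-net",  # hypothetical name
        type="network",
        policy=json.dumps([{
            "Rules": [{"ResourceType": "collection",
                       "Resource": [f"collection/{collection_name}"]}],
            "AllowFromPublic": True,
        }]),
    )
    # Data access policy: grant the lab's IAM user access to the
    # collection and its indexes.
    aoss.create_access_policy(
        name=f"{collection_name}-access",  # hypothetical name
        type="data",
        policy=json.dumps([{
            "Rules": [
                {"ResourceType": "collection",
                 "Resource": [f"collection/{collection_name}"],
                 "Permission": ["aoss:*"]},
                {"ResourceType": "index",
                 "Resource": [f"index/{collection_name}/*"],
                 "Permission": ["aoss:*"]},
            ],
            "Principal": [iam_user_arn],
        }]),
    )
    # The collection itself, typed for vector search; its endpoint
    # ({id}.{region}.aoss.amazonaws.com) is what --endpoint expects later.
    resp = aoss.create_collection(name=collection_name, type="VECTORSEARCH")
    return resp["createCollectionDetail"]["id"]
```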
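For the Part 2 changes, here is a minimal standalone sketch of the retrieval path that the refactored `opensearch_client()`/`similarity_search()` pair implements. The signing setup, query size of 5, and the `_source['text']` field come from the diff; the knn field name `vector_field` is an assumption, since the hunk truncates the query body.

```python
# Sketch of Part 2's retrieval path, assuming an existing OpenSearch
# Serverless index with a knn_vector field named "vector_field".
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

def build_client(endpoint: str, region: str = "us-east-1") -> OpenSearch:
    # Sign requests with the caller's AWS credentials for the "aoss"
    # service, mirroring the patched opensearch_client() helper.
    auth = AWSV4SignerAuth(boto3.Session().get_credentials(), region, "aoss")
    return OpenSearch(
        hosts=[{"host": endpoint, "port": 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

def knn_search(client: OpenSearch, embedding: list, index_name: str, k: int = 5):
    # Top-k nearest neighbours of the question embedding; "vector_field"
    # is assumed, "text" matches the _source field read in the diff.
    query_body = {
        "size": k,
        "query": {"knn": {"vector_field": {"vector": embedding, "k": k}}},
    }
    response = client.search(index=index_name, body=query_body)
    return [hit["_source"]["text"] for hit in response["hits"]["hits"]]
```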
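One usage note on the Part 2 entry point: since the app is a Streamlit script (`st.set_page_config`, `st.chat_input`), it is normally launched with `streamlit run`, which only forwards arguments placed after a `--` separator to the script, e.g. `streamlit run "Part 2/main.py" -- [YourVectorDBEndpoint] [YourIndexName]`. Running it as plain `python main.py` would parse the arguments but not start the Streamlit server.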