From 64a49f187fe7c5cbd0cdb5c9fc1d7f39fe822b33 Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Thu, 12 Sep 2024 14:31:34 +0200
Subject: [PATCH] Adjust the readme

---
 Part 1/create-S3-and-put-docs.py |  8 +++---
 Part 1/create-vector-db.py       |  4 +--
 Part 1/main.py                   |  6 ++---
 Part 2/main.py                   | 34 ++++++++++++++++----------
 README.md                        | 42 ++++++++++++++++++++++----------
 5 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/Part 1/create-S3-and-put-docs.py b/Part 1/create-S3-and-put-docs.py
index 19b310f..2728452 100644
--- a/Part 1/create-S3-and-put-docs.py
+++ b/Part 1/create-S3-and-put-docs.py
@@ -24,7 +24,7 @@ def write_files(s3_client, directory, bucket):
                 Key=filename
             )
             print(f"{filename} uploaded successfully.")
-
+
 def main(bucket_name, local_dir):
     s3_client = boto3.client('s3')
     create_bucket(s3_client, bucket_name)
@@ -32,8 +32,8 @@ def main(bucket_name, local_dir):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
-    parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
-    parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
+    parser.add_argument("--bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
+    parser.add_argument("--local_path", help="The local folder containing the PDF files to upload")
     args = parser.parse_args()
-    main(args.bucket_name, args.LOCAL_DIR)
+    main(args.bucket_name, args.local_path)

diff --git a/Part 1/create-vector-db.py b/Part 1/create-vector-db.py
index 68aedb8..ca98882 100644
--- a/Part 1/create-vector-db.py
+++ b/Part 1/create-vector-db.py
@@ -152,7 +152,7 @@ def main(collection_name,IAM_USER):
 
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Create collection")
-    parser.add_argument("collection_name", help="The name of the collection")
-    parser.add_argument("iam_user", help="The iam user")
+    parser.add_argument("--collection_name", help="The name of the collection")
+    parser.add_argument("--iam_user", help="The IAM user")
     args = parser.parse_args()
     main(args.collection_name,args.iam_user)

diff --git a/Part 1/main.py b/Part 1/main.py
index ab8850d..3a6c717 100644
--- a/Part 1/main.py
+++ b/Part 1/main.py
@@ -165,8 +165,8 @@ def main(bucket_name, endpoint,index_name):
 
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
-    parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
-    parser.add_argument("endpoint", help="The OpenSearch service endpoint")
-    parser.add_argument("index_name", help="The name of the OpenSearch index")
+    parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
+    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
+    parser.add_argument("--index_name", help="The name of the OpenSearch index")
     args = parser.parse_args()
     main(args.bucket_name, args.endpoint, args.index_name)

diff --git a/Part 2/main.py b/Part 2/main.py
index 0a9d201..cf8836e 100644
--- a/Part 2/main.py
+++ b/Part 2/main.py
@@ -7,8 +7,9 @@
 from langchain.chains import RetrievalQA
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.chat_models import BedrockChat
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
-
 from langchain import PromptTemplate
+import argparse
+
 # Embeddings Client
 bedrock_client = boto3.client(service_name="bedrock-runtime")
@@ -24,26 +25,26 @@ st.set_page_config(
 
 st.title("Chat with your lecture")
 
-# AWS and OpenSearch Configuration
-host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
-index_name = 'cloud_lecture'
-awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+
 # OpenSearch Client
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': 443}],
+def opensearch_client(endpoint):
+    awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+    client = OpenSearch(
+        hosts=[{'host': endpoint, 'port': 443}],
     http_auth=awsauth,
     use_ssl=True,
     verify_certs=True,
     connection_class=RequestsHttpConnection,
-)
+    )
+    return client
 
 def get_embedding(question, bedrock_client):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     embedding = embeddings_model.embed_query(question)
     return embedding
 
-def similarity_search(embed_query, index_name):
+def similarity_search(client, embed_query, index_name):
     query_body = {
         "size": 5,
         "query": {
@@ -55,7 +56,7 @@
             }
         }
     }
-    response = opensearch_client.search(index=index_name, body=query_body)
+    response = client.search(index=index_name, body=query_body)
     return response['hits']['hits']
 
 def prepare_prompt(question, context):
@@ -85,7 +86,9 @@
 
     return answer
 
-def main():
+def main(endpoint, index_name):
+
+    oss_client = opensearch_client(endpoint)
 
     # initialize chat session in streamlit if not already present
     if "chat_history" not in st.session_state:
@@ -110,7 +113,7 @@
         embed_question= get_embedding(user_prompt,bedrock_client)
         print(embed_question)
 
-        sim_results = similarity_search(embed_question, index_name)
+        sim_results = similarity_search(oss_client, embed_question, index_name)
         context = [i['_source']['text'] for i in sim_results]
         print(context)
         prompt = prepare_prompt(user_prompt, context)
@@ -122,6 +125,11 @@
             st.markdown(message["content"])
 
 if __name__== "__main__":
-    main()
+    # Argument parsing
+    parser = argparse.ArgumentParser(description='Configure endpoint and index name for the lecture chat application.')
+    parser.add_argument('endpoint', type=str, help='The endpoint for the OpenSearch service.')
+    parser.add_argument('index_name', type=str, help='The index name for storing embeddings.')
+    args = parser.parse_args()
+    main(args.endpoint, args.index_name)

diff --git a/README.md b/README.md
index a6abdd2..1a1c3bf 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,53 @@
 # chatbot-lab
 
 ## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
-2. Ensure python is installed: python 3.8 or higher
-2. Install required python libraries listed in the 'requirements.txt':
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to the setup guide provided in Session 1).
+2. Ensure Python is installed: Python 3.8 or higher.
+3. Install the required Python libraries listed in 'requirements.txt':
 `pip3 install -r requirements.txt`
 
 ## Part 1:
 
 ### Step 1: Object storage Creation
 
-Create an S3 bucket and upload a few PDF files (Detailed steps are provided in the first session).
+Create an S3 bucket and upload a few PDF files by running:
+`python create-S3-and-put-docs.py --bucket_name [YourBucketName] --local_path [PathToYourPDFFiles]`
+Where:
+`--bucket_name`: The name for the new S3 bucket to be created.
+`--local_path`: The local directory path where the PDF files are stored.
+
 ### Step 2: Vector Store Creation
 
-To set up the Vector Store, run the following command: `python create-vector-db.py`
+Create a vector database for storing embeddings by running:
+`python create-vector-db.py --collection_name [YourCollectionName] --iam_user [YourIAMUser]`
+Where:
+`--collection_name`: The name of the collection you want to create to store the embeddings.
+`--iam_user`: Your IAM user; for example, for group 14 the IAM user is master-group-14.
+
 This script performs the following actions:
-* Set up the security policies: Sets up encryption, network, and data access policies for collections starting with "test".
-* Vector Store Initialization: Creates a vector store named test1, specifically designed for vector search operations.
-* Endpoint Retrieval: After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
+* Sets up encryption, network, and data access policies for the collection.
+* Creates a vector store with the collection name passed as an argument.
+* Once the vector store is set up, retrieves and displays the store's endpoint for immediate use.
 
 ### Step 3: Vectorizing the PDF Files
 
-After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
-* In main.py, update the S3 bucket name to the one you created.
-* Update the Vector Store endpoint with the one provided by the setup script.
-* Execute the processing script: `python main.py`
+After setting up the S3 bucket and the Vector Store, process the PDF files to generate embeddings and store them in the vector database.
+
+Run:
+`python main.py --bucket_name [YourBucketName] --endpoint [YourVectorDBEndpoint] --index_name [YourIndexName]`
+
+Where:
+`--bucket_name`: The name of the S3 bucket containing the PDF files.
+`--endpoint`: The endpoint of the vector database.
+`--index_name`: The name of the index in the collection where the embeddings will be stored.
 
 The main.py script will:
 1. Download PDF files from the S3 bucket.
 2. Split them into chunks.
 3. Generate embeddings from the chunks.
-4. Store these embeddings in the OpenSearch Vector DB.
+4. Create an index in the vector DB.
+5. Store these embeddings in the OpenSearch Vector DB.
 
 ## Part 2:

--
GitLab
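
Note on Step 2: the patch documents create-vector-db.py but its body is not shown here. For orientation, below is a minimal sketch of what "encryption, network, and data access policies plus a vector collection" typically looks like with boto3's `opensearchserverless` client. The policy names, policy bodies, region, and the `create_collection` helper are illustrative assumptions, not the repository's actual implementation.

```python
# Illustrative sketch only: the shape of OpenSearch Serverless setup that
# Step 2 describes; create-vector-db.py may differ in names and details.
import json
import boto3

aoss = boto3.client("opensearchserverless", region_name="us-east-1")  # region assumed

def create_vector_collection(collection_name: str, iam_user_arn: str) -> str:
    # Encryption policy: AWS-owned key scoped to this collection.
    aoss.create_security_policy(
        name=f"{collection_name}-enc",  # hypothetical name
        type="encryption",
        policy=json.dumps({
            "Rules": [{"ResourceType": "collection",
                       "Resource": [f"collection/{collection_name}"]}],
            "AWSOwnedKey": True,
        }),
    )
    # Network policy: expose the collection endpoint publicly (lab setting).
    aoss.create_security_policy(
        name=f"{collection_name}-net",  # hypothetical name
        type="network",
        policy=json.dumps([{
            "Rules": [{"ResourceType": "collection",
                       "Resource": [f"collection/{collection_name}"]}],
            "AllowFromPublic": True,
        }]),
    )
    # Data access policy: grant the lab's IAM user access to the
    # collection and its indexes.
    aoss.create_access_policy(
        name=f"{collection_name}-access",  # hypothetical name
        type="data",
        policy=json.dumps([{
            "Rules": [
                {"ResourceType": "collection",
                 "Resource": [f"collection/{collection_name}"],
                 "Permission": ["aoss:*"]},
                {"ResourceType": "index",
                 "Resource": [f"index/{collection_name}/*"],
                 "Permission": ["aoss:*"]},
            ],
            "Principal": [iam_user_arn],
        }]),
    )
    # The collection itself, typed for vector search; its endpoint
    # ({id}.{region}.aoss.amazonaws.com) is what --endpoint expects later.
    resp = aoss.create_collection(name=collection_name, type="VECTORSEARCH")
    return resp["createCollectionDetail"]["id"]
```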
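For the Part 2 changes, here is a minimal standalone sketch of the retrieval path that the refactored `opensearch_client()`/`similarity_search()` pair implements. The signing setup, query size of 5, and the `_source['text']` field come from the diff; the knn field name `vector_field` is an assumption, since the hunk truncates the query body.

```python
# Sketch of Part 2's retrieval path, assuming an existing OpenSearch
# Serverless index with a knn_vector field named "vector_field".
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

def build_client(endpoint: str, region: str = "us-east-1") -> OpenSearch:
    # Sign requests with the caller's AWS credentials for the "aoss"
    # service, mirroring the patched opensearch_client() helper.
    auth = AWSV4SignerAuth(boto3.Session().get_credentials(), region, "aoss")
    return OpenSearch(
        hosts=[{"host": endpoint, "port": 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

def knn_search(client: OpenSearch, embedding: list, index_name: str, k: int = 5):
    # Top-k nearest neighbours of the question embedding; "vector_field"
    # is assumed, "text" matches the _source field read in the diff.
    query_body = {
        "size": k,
        "query": {"knn": {"vector_field": {"vector": embedding, "k": k}}},
    }
    response = client.search(index=index_name, body=query_body)
    return [hit["_source"]["text"] for hit in response["hits"]["hits"]]
```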
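One usage note on the Part 2 entry point: since the app is a Streamlit script (`st.set_page_config`, `st.chat_input`), it is normally launched with `streamlit run`, which only forwards arguments placed after a `--` separator to the script, e.g. `streamlit run "Part 2/main.py" -- [YourVectorDBEndpoint] [YourIndexName]`. Running it as plain `python main.py` would parse the arguments but not start the Streamlit server.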