Commit 64a49f18 authored by abir.chebbi

Adjust the readme

parent a5728da8
@@ -24,7 +24,7 @@ def write_files(s3_client, directory, bucket):
                 Key=filename
             )
             print(f"{filename} uploaded successfully.")
 
 def main(bucket_name, local_dir):
     s3_client = boto3.client('s3')
     create_bucket(s3_client, bucket_name)
@@ -32,8 +32,8 @@ def main(bucket_name, local_dir):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
-    parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
-    parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
+    parser.add_argument("--bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
+    parser.add_argument("--local_path", help="The name of the folder to put the pdf files")
     args = parser.parse_args()
-    main(args.bucket_name, args.LOCAL_DIR)
+    main(args.bucket_name, args.local_path)
@@ -152,7 +152,7 @@ def main(collection_name,IAM_USER):
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Create collection")
-    parser.add_argument("collection_name", help="The name of the collection")
-    parser.add_argument("iam_user", help="The iam user")
+    parser.add_argument("--collection_name", help="The name of the collection")
+    parser.add_argument("--iam_user", help="The iam user")
     args = parser.parse_args()
     main(args.collection_name,args.iam_user)
@@ -165,8 +165,8 @@ def main(bucket_name, endpoint,index_name):
 if __name__== "__main__":
     parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
-    parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
-    parser.add_argument("endpoint", help="The OpenSearch service endpoint")
-    parser.add_argument("index_name", help="The name of the OpenSearch index")
+    parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
+    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
+    parser.add_argument("--index_name", help="The name of the OpenSearch index")
     args = parser.parse_args()
     main(args.bucket_name, args.endpoint, args.index_name)
@@ -7,8 +7,9 @@ from langchain.chains import RetrievalQA
 from langchain_community.embeddings import BedrockEmbeddings
 from langchain_community.chat_models import BedrockChat
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
 from langchain import PromptTemplate
+import argparse
 
 # Embeddings Client
 bedrock_client = boto3.client(service_name="bedrock-runtime")
@@ -24,26 +25,26 @@ st.set_page_config(
 st.title("Chat with your lecture")
 
-# AWS and OpenSearch Configuration
-host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
-index_name = 'cloud_lecture'
-awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
 # OpenSearch Client
-opensearch_client = OpenSearch(
-    hosts=[{'host': host, 'port': 443}],
+def ospensearch_client(endpoint):
+    awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
+    client = OpenSearch(
+        hosts=[{'host': endpoint, 'port': 443}],
         http_auth=awsauth,
         use_ssl=True,
         verify_certs=True,
         connection_class=RequestsHttpConnection,
     )
+    return client
 
 def get_embedding(question, bedrock_client):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     embedding = embeddings_model.embed_query(question)
     return embedding
 
-def similarity_search(embed_query, index_name):
+def similarity_search(client, embed_query, index_name):
     query_body = {
         "size": 5,
         "query": {
@@ -55,7 +56,7 @@ def similarity_search(embed_query, index_name):
             }
         }
     }
-    response = opensearch_client.search(index=index_name, body=query_body)
+    response = client.search(index=index_name, body=query_body)
     return response['hits']['hits']
 
 def prepare_prompt(question, context):
@@ -85,7 +86,9 @@ def generate_answer(prompt):
     return answer
 
-def main():
+def main(endpoint, index_name):
+    oss_client= ospensearch_client(endpoint)
     # initialize chat session in streamlit if not already present
     if "chat_history" not in st.session_state:
@@ -110,7 +113,7 @@ def main():
         embed_question= get_embedding(user_prompt,bedrock_client)
         print(embed_question)
-        sim_results = similarity_search(embed_question, index_name)
+        sim_results = similarity_search(oss_client, embed_question, index_name)
         context = [i['_source']['text'] for i in sim_results]
         print(context)
         prompt = prepare_prompt(user_prompt, context)
@@ -122,6 +125,11 @@ def main():
         st.markdown(message["content"])
 
 if __name__== "__main__":
-    main()
+    # Argument parsing
+    parser = argparse.ArgumentParser(description='Configure endpoint and index name for the lecture chat application.')
+    parser.add_argument('endpoint', type=str, help='The endpoint for the OpenSearch service.')
+    parser.add_argument('index_name', type=str, help='The index name for storing embeddings.')
+    args = parser.parse_args()
+    main(args.endpoint, args.index_name)
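The chat application's `similarity_search` builds an OpenSearch k-NN query whose body is mostly elided in the hunk above. A minimal sketch of such a query body, assuming the embeddings live in a `knn_vector` field named `vector_field` (the actual field name is not shown in the commit), would be:

```python
# Hypothetical sketch of the elided query_body in similarity_search().
# "vector_field" is an assumed field name; it must match the knn_vector
# field used when the embeddings were indexed.
def build_knn_query(embed_query, k=5):
    return {
        "size": k,                          # number of chunks to return
        "query": {
            "knn": {
                "vector_field": {           # assumed embedding field name
                    "vector": embed_query,  # embedding of the user question
                    "k": k,
                }
            }
        },
    }
```

Each hit's `_source['text']` is then collected into the prompt context, as the `main()` body shows. Because the script now parses positional `endpoint` and `index_name` arguments, it would be launched through Streamlit with the script arguments after a `--` separator, e.g. `streamlit run <app>.py -- <endpoint> <index_name>` (the file name is not shown in the commit).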
# chatbot-lab

## Set up environment

1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to the setup guide provided in Session 1).
2. Ensure Python is installed: Python 3.8 or higher.
3. Install the required Python libraries listed in 'requirements.txt':
   `pip3 install -r requirements.txt`

## Part 1:

### Step 1: Object Storage Creation

Create an S3 bucket and upload a few PDF files by running:

`python create-S3-and-put-docs.py --bucket_name [YourBucketName] --local_path [PathToYourPDFFiles]`

Where:
* `--bucket_name`: The name of the new S3 bucket to be created.
* `--local_path`: The local directory path where the PDF files are stored.
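Only fragments of the upload script appear in the commit above. A minimal sketch of such a script, assuming the same CLI flags and that bucket creation is a plain CreateBucket call (the repository's actual implementation may differ, for example in region handling), could look like:

```python
# Minimal sketch of an S3 upload script with the CLI shown above.
# create_bucket() and write_files() only approximate the lab script.
import argparse
import os
import boto3

def create_bucket(s3_client, bucket_name):
    # Works as-is in us-east-1; other regions need a CreateBucketConfiguration.
    s3_client.create_bucket(Bucket=bucket_name)

def write_files(s3_client, directory, bucket):
    # Upload every PDF in the local directory, keyed by its file name.
    for filename in os.listdir(directory):
        if filename.lower().endswith(".pdf"):
            s3_client.upload_file(
                Filename=os.path.join(directory, filename),
                Bucket=bucket,
                Key=filename,
            )
            print(f"{filename} uploaded successfully.")

def main(bucket_name, local_dir):
    s3_client = boto3.client("s3")
    create_bucket(s3_client, bucket_name)
    write_files(s3_client, local_dir, bucket_name)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
    parser.add_argument("--bucket_name", help="The name of the S3 bucket to create")
    parser.add_argument("--local_path", help="The folder containing the PDF files")
    args = parser.parse_args()
    main(args.bucket_name, args.local_path)
```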
### Step 2: Vector Store Creation

Create a vector database for storing embeddings by running:

`python create-vector-db.py --collection_name [NameOfCollection] --iam_user [YourIAMUser]`

Where:
* `--collection_name`: The name of the collection to create for storing the embeddings.
* `--iam_user`: Your IAM user; for example, for group 14 the IAM user is master-group-14.

This script performs the following actions:
* Sets up encryption, network, and data access policies for the collection.
* Creates a vector store named after the collection name passed as an argument.
* Once the vector store is set up, retrieves and displays the store's endpoint for immediate use.
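For orientation, this kind of setup can be sketched with the boto3 OpenSearch Serverless client. The policy documents, resource names, and polling below are simplified assumptions rather than the repository's exact code:

```python
# Simplified sketch of creating an OpenSearch Serverless vector collection.
# Policy documents and naming are assumptions; create-vector-db.py may differ.
import json
import time
import boto3

aoss = boto3.client("opensearchserverless")

def create_vector_collection(name, iam_principal_arn):
    # iam_principal_arn: full ARN of the IAM user to grant access
    # (the lab script takes a bare user name; how it builds the ARN is not shown).
    # Encryption policy: use an AWS-owned KMS key for this collection.
    aoss.create_security_policy(
        name=f"{name}-enc",
        type="encryption",
        policy=json.dumps({
            "Rules": [{"ResourceType": "collection", "Resource": [f"collection/{name}"]}],
            "AWSOwnedKey": True,
        }),
    )
    # Network policy: allow public access to the collection endpoint.
    aoss.create_security_policy(
        name=f"{name}-net",
        type="network",
        policy=json.dumps([{
            "Rules": [{"ResourceType": "collection", "Resource": [f"collection/{name}"]}],
            "AllowFromPublic": True,
        }]),
    )
    # Data access policy: grant the IAM user access to the collection and its indexes.
    aoss.create_access_policy(
        name=f"{name}-access",
        type="data",
        policy=json.dumps([{
            "Rules": [
                {"ResourceType": "collection", "Resource": [f"collection/{name}"], "Permission": ["aoss:*"]},
                {"ResourceType": "index", "Resource": [f"index/{name}/*"], "Permission": ["aoss:*"]},
            ],
            "Principal": [iam_principal_arn],
        }]),
    )
    # Create the collection itself, typed for vector search.
    aoss.create_collection(name=name, type="VECTORSEARCH")
    # Poll until the collection is ACTIVE, then print its endpoint.
    while True:
        details = aoss.batch_get_collection(names=[name])["collectionDetails"]
        if details and details[0]["status"] == "ACTIVE":
            print("Endpoint:", details[0]["collectionEndpoint"])
            break
        time.sleep(10)
```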
### Step 3: Vectorizing the PDF Files

After setting up the S3 bucket and the Vector Store, process the PDF files to generate and store their embeddings in the vector database. Run:

`python main.py --bucket_name [YourBucketName] --endpoint [YourVectorDBEndpoint] --index_name [YourIndexName]`

Where:
* `--bucket_name`: The name of the S3 bucket containing the PDF files.
* `--endpoint`: The endpoint of the vector database.
* `--index_name`: The index in which to store the embeddings within the collection.

The main.py script will:
1. Download the PDF files from the S3 bucket.
2. Split them into chunks.
3. Generate embeddings from the chunks.
4. Create an index in the vector DB.
5. Store these embeddings in the OpenSearch Vector DB.
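A rough sketch of that pipeline, using LangChain components consistent with the imports visible in this commit (the PDF loader, chunk sizes, and the exact structure of main.py are assumptions), could look like:

```python
# Rough sketch of the vectorization pipeline described above (assumptions noted).
import os
import boto3
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch
from opensearchpy import RequestsHttpConnection, AWSV4SignerAuth

def vectorize(bucket_name, endpoint, index_name, workdir="./docs"):
    # 1. Download the PDF files from the S3 bucket into a local working directory.
    s3 = boto3.client("s3")
    os.makedirs(workdir, exist_ok=True)
    for obj in s3.list_objects_v2(Bucket=bucket_name).get("Contents", []):
        s3.download_file(bucket_name, obj["Key"], os.path.join(workdir, obj["Key"]))

    # 2. Load the PDFs and split them into chunks (chunk sizes are an assumption).
    docs = PyPDFDirectoryLoader(workdir).load()
    chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)

    # 3.-5. Generate Bedrock embeddings and index them into the OpenSearch collection.
    embeddings = BedrockEmbeddings(
        model_id="amazon.titan-embed-text-v1",
        client=boto3.client("bedrock-runtime"),
    )
    awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), "us-east-1", "aoss")
    OpenSearchVectorSearch.from_documents(
        chunks,
        embeddings,
        opensearch_url=f"https://{endpoint}:443",
        http_auth=awsauth,
        index_name=index_name,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )
```

With `OpenSearchVectorSearch.from_documents`, the index is created automatically if it does not already exist, which corresponds to step 4 above; main.py may instead create the index and its k-NN mapping explicitly.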
## Part 2:
...