From 581a94ce2501b237f0d8a8332c627edad1679d4a Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Thu, 5 Sep 2024 09:11:53 +0200
Subject: [PATCH] Separate the func generate_store_emb

---
 Part 1/main.py | 64 +++++++++++++++++++++++++++++++++++++-------------
 README.md      | 36 ++++++++++++++++++++++------
 2 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/Part 1/main.py b/Part 1/main.py
index 3909422..e42c3cc 100644
--- a/Part 1/main.py
+++ b/Part 1/main.py
@@ -1,14 +1,18 @@
 import boto3
 import os
+from tqdm.auto import tqdm
 from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import BedrockEmbeddings
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
 from langchain_community.vectorstores import OpenSearchVectorSearch
+import uuid
+import json
+
 
 ## Local directory for storing PDF files
 LOCAL_DIR = "pdfs"
-index_name = "cloud_lecture_test"
+index_name = "cloud_lecture_test3"
 
 
 ## S3_client
@@ -25,7 +29,7 @@ credentials = boto3.Session().get_credentials()
 awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
 
 ## Vector DB endpoint
-host= 'ispfynbvy6eov4efdsqd.us-east-1.aoss.amazonaws.com'
+host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
 
 ## Opensearch Client
 OpenSearch_client = OpenSearch(
@@ -34,7 +38,6 @@ OpenSearch_client = OpenSearch(
     use_ssl=True,
     verify_certs=True,
     connection_class=RequestsHttpConnection,
-    timeout=300
 )
 
 
@@ -83,19 +86,38 @@ def download_documents(bucket_name,local_dir):
 
 
 ## Split pages/text into chunks
-def split_text(pages, chunk_size, chunk_overlap, local_dir):
-    loader= PyPDFDirectoryLoader(local_dir)
-    pages = loader.load_and_split()
+def split_text(docs, chunk_size, chunk_overlap):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = text_splitter.split_documents(pages)
+    chunks = text_splitter.split_documents(docs)
+    return chunks
 
 
-## Generate embeddings and index them using Opensearch
-# def generate_embeddings():
+## Generate embeddings
+def generate_embeddings(bedrock_client, chunks):
+    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
+    chunks_list=[chunk.page_content for chunk in chunks]
+    embeddings = embeddings_model.embed_documents(chunks_list)
+    return embeddings
+
+# Store generated embeddings into an OpenSearch index.
+def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
+    docsearch = OpenSearchVectorSearch.from_embeddings(
+        embeddings,
+        texts,
+        meta_data,
+        opensearch_url=f'https://{host}:443',
+        http_auth=awsauth,
+        use_ssl=True,
+        verify_certs=True,
+        connection_class=RequestsHttpConnection,
+        index_name=index_name,
+        bulk_size=1000
+    )
 
-# def store_embeddings():
+    return docsearch
 
+# Func to do both generating and storing embeddings
 def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     docsearch = OpenSearchVectorSearch.from_documents(
@@ -117,13 +139,23 @@ def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
 
 ## main
 def main():
-    docs= download_documents(BUCKET_NAME,LOCAL_DIR)
-    chunks=split_text(docs, 1000, 100, LOCAL_DIR)
-    print("Sample chunk:", chunks[0])
-    create_index(index_name)
-    embeddings = generate_store_embeddings(bedrock_client, chunks,awsauth,index_name)
-    print("Embeddings processing completed", embeddings)
+    download_documents(BUCKET_NAME,LOCAL_DIR)
+    loader= PyPDFDirectoryLoader(LOCAL_DIR)
+    docs = loader.load()
+    print(docs[80])
+    chunks = split_text(docs, 1000, 100)
+    print(chunks[80])
+    embeddings= generate_embeddings(bedrock_client, chunks)
+    print(embeddings[80])
+    texts = [chunk.page_content for chunk in chunks]
+    # Prepare metadata for each chunk
+    meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
+    print(embeddings[80])
+    print(meta_data[80])
+    store_embeddings(embeddings, texts, meta_data ,host, awsauth,index_name)
+
+

diff --git a/README.md b/README.md
index 41ca5aa..d5fad60 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,36 @@
 # chatbot-lab
+## Set up environment
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to Session 1)
+2. Ensure Python is installed: Python 3.8 or higher
+3. Install the required Python libraries listed in 'requirements.txt':
+`pip install -r requirements.txt`
 
-## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop.
-    * Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
-    * Configure the CLI with your AWS credentials: 'aws configure'
-2. Ensure python is installed.
-2. Install required python libraries listed in the 'requirements.txt': pip install -r requirement.txt
+## Part 1:
+
+### Step 1: Create S3 Bucket
+Create an S3 bucket and upload a few PDF files (detailed steps are provided in the first session).
+
+### Step 2: Vector Store Creation
+To set up the Vector Store, run the following command:
+`python Create-Vector-DB.py`
+
+This script performs the following actions:
+* Security policy setup: Sets up encryption, network, and data access policies for collections whose names start with "test".
+
+* Vector Store initialization: Creates a vector store named test1, specifically designed for vector search operations.
 
-##
+* Endpoint retrieval: Once the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
 
+### Step 3: Processing PDF Files
+After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
+* In main.py, update the S3 bucket name to the one you created.
+* Update the Vector Store endpoint with the one provided by the setup script.
+* Execute the processing script: `python main.py`
+The main.py script will:
+1. Download PDF files from the S3 bucket.
+2. Split them into chunks.
+3. Generate embeddings from the chunks.
+4. Store these embeddings in the OpenSearch Vector DB.
\ No newline at end of file
-- 
GitLab
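For reviewers who want to exercise the refactored flow outside the repo, here is a minimal, self-contained sketch of the pipeline the new `main()` assembles, using the same `generate_embeddings()`/`store_embeddings()` split introduced by this patch. The region, collection endpoint, and index name are placeholder assumptions, and the PDFs are assumed to already sit in the local `pdfs/` directory (in the patch, `download_documents()` fetches them from S3 first).

```python
import boto3
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch
from opensearchpy import RequestsHttpConnection, AWSV4SignerAuth

# Placeholder values -- substitute your own endpoint and index name.
LOCAL_DIR = "pdfs"  # assumes the PDFs were already downloaded from S3
HOST = "<your-collection-id>.us-east-1.aoss.amazonaws.com"
INDEX_NAME = "cloud_lecture_test3"

bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, "us-east-1", "aoss")

# 1. Load the PDFs and split them into overlapping chunks.
docs = PyPDFDirectoryLoader(LOCAL_DIR).load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# 2. Embed each chunk with the Bedrock Titan embedding model.
embeddings_model = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", client=bedrock_client
)
texts = [chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(texts)

# 3. Bulk-index embeddings, texts, and per-chunk metadata into the
#    OpenSearch Serverless collection (same call as store_embeddings()).
meta_data = [
    {"source": c.metadata["source"], "page": c.metadata["page"] + 1}
    for c in chunks
]
OpenSearchVectorSearch.from_embeddings(
    embeddings,
    texts,
    meta_data,
    opensearch_url=f"https://{HOST}:443",
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    index_name=INDEX_NAME,
    bulk_size=1000,
)
```

Separating generation from storage this way lets the embedding step be retried or batched independently of the bulk-indexing step, which is the point of splitting the old `generate_store_embeddings()`.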