From 581a94ce2501b237f0d8a8332c627edad1679d4a Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Thu, 5 Sep 2024 09:11:53 +0200
Subject: [PATCH] Separate the func generate_store_emb

---
 Part 1/main.py | 64 +++++++++++++++++++++++++++++++++++++-------------
 README.md      | 36 ++++++++++++++++++++++------
 2 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/Part 1/main.py b/Part 1/main.py
index 3909422..e42c3cc 100644
--- a/Part 1/main.py
+++ b/Part 1/main.py
@@ -1,14 +1,18 @@
 import boto3
 import os
+from tqdm.auto import tqdm
 from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import BedrockEmbeddings
 from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
 from langchain_community.vectorstores import OpenSearchVectorSearch
+import uuid
+import json
+
 
 ## Local directory for storing PDF files
 LOCAL_DIR = "pdfs"
-index_name = "cloud_lecture_test"
+index_name = "cloud_lecture_test3"
 
 
 ## S3_client
@@ -25,7 +29,7 @@ credentials = boto3.Session().get_credentials()
 awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
 
 ## Vector DB endpoint
-host= 'ispfynbvy6eov4efdsqd.us-east-1.aoss.amazonaws.com'
+host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
 
 ## Opensearch Client
 OpenSearch_client = OpenSearch(
@@ -34,7 +38,6 @@ OpenSearch_client = OpenSearch(
     use_ssl=True,
     verify_certs=True,
     connection_class=RequestsHttpConnection,
-    timeout=300
 )
 
 
@@ -83,19 +86,38 @@ def download_documents(bucket_name,local_dir):
 
 
 ## Split pages/text into chunks
-def split_text(pages, chunk_size, chunk_overlap, local_dir):
-    loader= PyPDFDirectoryLoader(local_dir)
-    pages = loader.load_and_split()
+def split_text(docs, chunk_size, chunk_overlap):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = text_splitter.split_documents(pages)
+    chunks = text_splitter.split_documents(docs)
+    return chunks
 
 
-## Generate embeddings and index them using Opensearch
-# def generate_embeddings():
+## Generate embeddings
+def generate_embeddings(bedrock_client, chunks):
+    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
+    chunks_list=[chunk.page_content for chunk in chunks]
+    embeddings = embeddings_model.embed_documents(chunks_list)
+    return embeddings
+
+# Store generated embeddings into an OpenSearch index.
+def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
+    docsearch = OpenSearchVectorSearch.from_embeddings(
+        embeddings,
+        texts,
+        meta_data,
+        opensearch_url=f'https://{host}:443',
+        http_auth=awsauth,
+        use_ssl=True,
+        verify_certs=True,
+        connection_class=RequestsHttpConnection,
+        index_name=index_name,
+        bulk_size=1000
+    )
 
-# def store_embeddings():
+    return docsearch
 
+# Func to do both generating and storing embeddings
 def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     docsearch = OpenSearchVectorSearch.from_documents(
@@ -117,13 +139,23 @@ def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
 
 ## main
 def main():
-    docs= download_documents(BUCKET_NAME,LOCAL_DIR)
-    chunks=split_text(docs, 1000, 100, LOCAL_DIR)
-    print("Sample chunk:", chunks[0])
-    create_index(index_name)
-    embeddings = generate_store_embeddings(bedrock_client, chunks,awsauth,index_name)
-    print("Embeddings processing completed", embeddings)
+    download_documents(BUCKET_NAME,LOCAL_DIR)
+    loader= PyPDFDirectoryLoader(LOCAL_DIR)
+    docs = loader.load()
+    print(docs[80])
+    chunks = split_text(docs, 1000, 100)
+    print(chunks[80])
+    embeddings= generate_embeddings(bedrock_client, chunks)
+    print(embeddings[80])
+    texts = [chunk.page_content for chunk in chunks]
+    # Prepare metadata for each chunk
+    meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
+    print(embeddings[80])
+    print(meta_data[80])
+    store_embeddings(embeddings, texts, meta_data ,host, awsauth,index_name)
+
+

diff --git a/README.md b/README.md
index 41ca5aa..d5fad60 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,36 @@
 # chatbot-lab
+## Set up environment
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to Session 1)
+2. Ensure Python is installed: Python 3.8 or higher
+3. Install the required Python libraries listed in 'requirements.txt':
+`pip install -r requirements.txt`
 
-## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop.
-    * Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
-    * Configure the CLI with your AWS credentials: 'aws configure'
-2. Ensure python is installed.
-2. Install required python libraries listed in the 'requirements.txt': pip install -r requirement.txt
+## Part 1:
+
+### Step 1: Create S3 Bucket
+Create an S3 bucket and upload a few PDF files (detailed steps are provided in the first session).
+
+### Step 2: Vector Store Creation
+To set up the Vector Store, run the following command:
+`python Create-Vector-DB.py`
+
+This script performs the following actions:
+* Security policy setup: Sets up encryption, network, and data access policies for collections whose names start with "test".
+
+* Vector Store initialization: Creates a vector store named test1, specifically designed for vector search operations.
 
-##
+* Endpoint retrieval: Once the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
 
+### Step 3: Processing PDF Files
+After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
+* In main.py, update the S3 bucket name to the one you created.
+* Update the Vector Store endpoint with the one provided by the setup script.
+* Execute the processing script: `python main.py`
+The main.py script will:
+1. Download PDF files from the S3 bucket.
+2. Split them into chunks.
+3. Generate embeddings from the chunks.
+4. Store these embeddings in the OpenSearch Vector DB.
\ No newline at end of file
-- 
GitLab
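For reviewers who want to exercise the refactored flow outside the repo, here is a minimal, self-contained sketch of the pipeline the new `main()` assembles, using the same `generate_embeddings()`/`store_embeddings()` split introduced by this patch. The region, collection endpoint, and index name are placeholder assumptions, and the PDFs are assumed to already sit in the local `pdfs/` directory (in the patch, `download_documents()` fetches them from S3 first).

```python
import boto3
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch
from opensearchpy import RequestsHttpConnection, AWSV4SignerAuth

# Placeholder values -- substitute your own endpoint and index name.
LOCAL_DIR = "pdfs"  # assumes the PDFs were already downloaded from S3
HOST = "<your-collection-id>.us-east-1.aoss.amazonaws.com"
INDEX_NAME = "cloud_lecture_test3"

bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, "us-east-1", "aoss")

# 1. Load the PDFs and split them into overlapping chunks.
docs = PyPDFDirectoryLoader(LOCAL_DIR).load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# 2. Embed each chunk with the Bedrock Titan embedding model.
embeddings_model = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", client=bedrock_client
)
texts = [chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(texts)

# 3. Bulk-index embeddings, texts, and per-chunk metadata into the
#    OpenSearch Serverless collection (same call as store_embeddings()).
meta_data = [
    {"source": c.metadata["source"], "page": c.metadata["page"] + 1}
    for c in chunks
]
OpenSearchVectorSearch.from_embeddings(
    embeddings,
    texts,
    meta_data,
    opensearch_url=f"https://{HOST}:443",
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    index_name=INDEX_NAME,
    bulk_size=1000,
)
```

Separating generation from storage this way lets the embedding step be retried or batched independently of the bulk-indexing step, which is the point of splitting the old `generate_store_embeddings()`.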