Commit 581a94ce authored by abir.chebbi

Separate the function generate_store_embeddings

parent 561267ae
import boto3
import os
from tqdm.auto import tqdm
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain_community.vectorstores import OpenSearchVectorSearch
import uuid
import json
## Local directory for storing PDF files
LOCAL_DIR = "pdfs"
-index_name = "cloud_lecture_test"
+index_name = "cloud_lecture_test3"
## S3_client
@@ -25,7 +29,7 @@ credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
## Vector DB endpoint
-host = 'ispfynbvy6eov4efdsqd.us-east-1.aoss.amazonaws.com'
+host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
## Opensearch Client
OpenSearch_client = OpenSearch(
@@ -34,7 +38,6 @@ OpenSearch_client = OpenSearch(
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=300
)
@@ -83,19 +86,38 @@ def download_documents(bucket_name,local_dir):
## Split pages/text into chunks
-def split_text(pages, chunk_size, chunk_overlap, local_dir):
-    loader = PyPDFDirectoryLoader(local_dir)
-    pages = loader.load_and_split()
+def split_text(docs, chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = text_splitter.split_documents(pages)
+    chunks = text_splitter.split_documents(docs)
    return chunks
-## Generate embeddings and index them using Opensearch
-# def generate_embeddings():
+## Generate embeddings
+def generate_embeddings(bedrock_client, chunks):
+    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
+    chunks_list = [chunk.page_content for chunk in chunks]
+    embeddings = embeddings_model.embed_documents(chunks_list)
+    return embeddings
+# Store generated embeddings into an OpenSearch index.
+def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
+    docsearch = OpenSearchVectorSearch.from_embeddings(
+        embeddings,
+        texts,
+        meta_data,
+        opensearch_url=f'https://{host}:443',
+        http_auth=awsauth,
+        use_ssl=True,
+        verify_certs=True,
+        connection_class=RequestsHttpConnection,
+        index_name=index_name,
+        bulk_size=1000
+    )
-# def store_embeddings():
+    return docsearch
# Func to do both generating and storing embeddings
def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    docsearch = OpenSearchVectorSearch.from_documents(
@@ -117,12 +139,22 @@ def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
## main
def main():
-    docs = download_documents(BUCKET_NAME,LOCAL_DIR)
-    chunks = split_text(docs, 1000, 100, LOCAL_DIR)
-    print("Sample chunk:", chunks[0])
-    create_index(index_name)
-    embeddings = generate_store_embeddings(bedrock_client, chunks,awsauth,index_name)
-    print("Embeddings processing completed", embeddings)
+    download_documents(BUCKET_NAME,LOCAL_DIR)
+    loader = PyPDFDirectoryLoader(LOCAL_DIR)
+    docs = loader.load()
+    print(docs[80])
+    chunks = split_text(docs, 1000, 100)
+    print(chunks[80])
+    embeddings = generate_embeddings(bedrock_client, chunks)
+    print(embeddings[80])
+    texts = [chunk.page_content for chunk in chunks]
+    # Prepare metadata for each chunk
+    meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
+    print(meta_data[80])
+    store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name)
# chatbot-lab
-## Set up environment
-1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
-2. Ensure python is installed: python 3.8 or higher
-2. Install required python libraries listed in the 'requirements.txt':
-   `pip install -r requirement.txt`
+## Set up environment
+1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop.
+   * Install the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
+   * Configure the CLI with your AWS credentials: `aws configure`
+2. Ensure Python is installed (Python 3.8 or higher).
+3. Install the required Python libraries listed in `requirements.txt`: `pip install -r requirements.txt`
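To confirm the environment is ready, you can run a quick import check. The package list below is inferred from the imports in the diff above, not taken from the repository's requirements.txt, so adjust it to match the actual file:

```python
# Sanity check: the libraries used by the lab scripts should be importable.
# Package list inferred from the imports above; adjust to match requirements.txt.
import boto3
import langchain
import langchain_community
import opensearchpy
import pypdf
import tqdm

print("Environment looks good, boto3", boto3.__version__)
```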
## Part 1:
### Step 1: Create S3 Bucket
Create an S3 bucket and upload a few PDF files (Detailed steps are provided in the first session).
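If you prefer to script this step rather than use the console, a minimal boto3 sketch could look like the following; the bucket name and file names are placeholders, and the bucket is created in us-east-1 to match the rest of the lab:

```python
# Hypothetical helper: create the bucket and upload a few PDFs with boto3.
import boto3

BUCKET_NAME = "your-pdf-bucket"  # placeholder: pick a globally unique name

s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket=BUCKET_NAME)  # us-east-1 needs no LocationConstraint

for path in ["lecture1.pdf", "lecture2.pdf"]:  # placeholder file names
    s3.upload_file(path, BUCKET_NAME, path)
```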
### Step 2: Vector Store Creation
To set up the Vector Store, run the following command:
`python Create-Vector-DB.py`
This script performs the following actions (a boto3 sketch of these calls follows the list):
* Security policies: sets up encryption, network, and data access policies for collections whose names start with "test".
* Vector store initialization: creates a vector store named test1, specifically designed for vector search operations.
* Endpoint retrieval: after the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
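Create-Vector-DB.py itself is not shown in this commit, so the following is only a sketch of the OpenSearch Serverless (boto3 `opensearchserverless`) calls that the description above implies. The policy names, the IAM principal ARN, and the region are placeholders:

```python
# Sketch of the collection setup described above (not the actual Create-Vector-DB.py).
import json
import boto3

aoss = boto3.client("opensearchserverless", region_name="us-east-1")

# Encryption policy for collections whose names start with "test"
aoss.create_security_policy(
    name="test-encryption",
    type="encryption",
    policy=json.dumps({
        "Rules": [{"ResourceType": "collection", "Resource": ["collection/test*"]}],
        "AWSOwnedKey": True,
    }),
)

# Network policy allowing public access to the same collections
aoss.create_security_policy(
    name="test-network",
    type="network",
    policy=json.dumps([{
        "Rules": [{"ResourceType": "collection", "Resource": ["collection/test*"]}],
        "AllowFromPublic": True,
    }]),
)

# Data access policy for the identity running the lab (placeholder ARN)
aoss.create_access_policy(
    name="test-data-access",
    type="data",
    policy=json.dumps([{
        "Rules": [
            {"ResourceType": "collection", "Resource": ["collection/test*"], "Permission": ["aoss:*"]},
            {"ResourceType": "index", "Resource": ["index/test*/*"], "Permission": ["aoss:*"]},
        ],
        "Principal": ["arn:aws:iam::123456789012:user/your-user"],
    }]),
)

# Create the vector search collection and read back its endpoint
aoss.create_collection(name="test1", type="VECTORSEARCH")
details = aoss.batch_get_collection(names=["test1"])["collectionDetails"][0]
print(details.get("collectionEndpoint"))  # empty until the collection becomes ACTIVE
```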
### Step 3: Processing PDF Files
After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
* In main.py, update the S3 bucket name to the one you created.
* Update the Vector Store endpoint with the one provided by the setup script (see the snippet below).
* Execute the processing script: `python main.py`
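Concretely, the values to update are the BUCKET_NAME and host variables in main.py (host is visible in the diff above; BUCKET_NAME is defined in a part of the file not shown in this commit). The values below are placeholders:

```python
BUCKET_NAME = "your-pdf-bucket"                              # bucket created in Step 1
host = "xxxxxxxxxxxxxxxxxxxx.us-east-1.aoss.amazonaws.com"   # endpoint printed in Step 2
```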
The main.py script will:
1. Download PDF files from the S3 bucket.
2. Split them into chunks.
3. Generate embeddings from the chunks.
4. Store these embeddings in the OpenSearch Vector DB.
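As a quick check that the pipeline worked (this is not part of main.py), you can run a similarity search against the index. The sketch below reuses the same endpoint, index name, and SigV4 auth as the script; the endpoint, the Bedrock client region, and the example question are placeholders:

```python
# Hypothetical verification step: query the freshly populated index.
import boto3
from opensearchpy import AWSV4SignerAuth, RequestsHttpConnection
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch

host = "xxxxxxxxxxxxxxxxxxxx.us-east-1.aoss.amazonaws.com"  # placeholder: your collection endpoint
index_name = "cloud_lecture_test3"

awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), "us-east-1", "aoss")
bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)

docsearch = OpenSearchVectorSearch(
    opensearch_url=f"https://{host}:443",
    index_name=index_name,
    embedding_function=embeddings_model,
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)
for doc in docsearch.similarity_search("What is cloud computing?", k=3):
    print(doc.metadata, doc.page_content[:200])
```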