From eb339d0ff4449bb2148d15f1ba6a784e36988a5f Mon Sep 17 00:00:00 2001
From: "abir.chebbi" <abir.chebbi@hes-so.ch>
Date: Mon, 2 Sep 2024 15:46:24 +0200
Subject: [PATCH] add scripts

---
 Part 1/Create-Vector-DB.py       | 155 +++++++++++++++++++++++++++++++
 Part 1/Delete-Vector-DB.py       |   0
 Part 1/Delete-s3.py              |  24 +++++
 Part 1/create-S3-and-put-docs.py |  35 +++++++
 Part 1/main.py                   |  12 ++-
 5 files changed, 222 insertions(+), 4 deletions(-)
 create mode 100644 Part 1/Create-Vector-DB.py
 create mode 100644 Part 1/Delete-Vector-DB.py
 create mode 100644 Part 1/Delete-s3.py
 create mode 100644 Part 1/create-S3-and-put-docs.py

diff --git a/Part 1/Create-Vector-DB.py b/Part 1/Create-Vector-DB.py
new file mode 100644
index 0000000..4e63c5e
--- /dev/null
+++ b/Part 1/Create-Vector-DB.py	
@@ -0,0 +1,155 @@
+## Source: https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-sdk.html
+import boto3
+import botocore
+import time
+
+
+client = boto3.client('opensearchserverless')  # boto3 client passed to every function below
+service = 'aoss'  # NOTE(review): not referenced anywhere else in this script
+Vector_store_name='test1'  # collection to create; the policies below match names starting with "test"
+
+def createEncryptionPolicy(client):
+    """Create the encryption policy covering every collection named test*.
+
+    The policy is named 'test-policy' and uses an AWS-owned key. A
+    ConflictException (policy already present) is reported and ignored;
+    any other ClientError is re-raised.
+    """
+    try:
+        response = client.create_security_policy(
+            description='Encryption policy for test collections',
+            name='test-policy',
+            # Plain '/' instead of '\/': the latter is an invalid Python escape.
+            policy="""
+                {
+                    "Rules":[
+                        {"ResourceType":"collection", "Resource":["collection/test*"]}
+                    ],
+                    "AWSOwnedKey":true
+                }
+                """,
+            type='encryption'
+        )
+        print('\nEncryption policy created:')
+        print(response)
+    except botocore.exceptions.ClientError as error:
+        if error.response['Error']['Code'] == 'ConflictException':
+            print('[ConflictException] The policy name or rules conflict with an existing policy.')
+        else:
+            raise
+
+
+def createNetworkPolicy(client):
+    """Create the network policy covering every collection named test*.
+
+    The policy is named 'test-policy' and allows public access to both the
+    collection endpoint and its dashboard. A ConflictException (policy
+    already present) is reported and ignored; any other ClientError is
+    re-raised.
+    """
+    try:
+        response = client.create_security_policy(
+            description='Network policy for Test collections',
+            name='test-policy',
+            # Plain '/' instead of '\/': the latter is an invalid Python escape.
+            policy="""
+                [{
+                    "Description":"Public access for Test collection",
+                    "Rules":[
+                        {"ResourceType":"dashboard", "Resource":["collection/test*"]},
+                        {"ResourceType":"collection", "Resource":["collection/test*"]}
+                    ],
+                    "AllowFromPublic":true
+                }]
+                """,
+            type='network'
+        )
+        print('\nNetwork policy created:')
+        print(response)
+    except botocore.exceptions.ClientError as error:
+        if error.response['Error']['Code'] == 'ConflictException':
+            print('[ConflictException] A network policy with this name already exists.')
+        else:
+            raise
+
+
+def createAccessPolicy(client, principal='arn:aws:iam::768034348959:user/AbirChebbi'):
+    """Create the data access policy for collections and indexes named test*.
+
+    Args:
+        client: boto3 'opensearchserverless' client used for the call.
+        principal: IAM ARN granted the permissions (default: the original user).
+
+    A ConflictException (policy already present) is reported and ignored;
+    any other ClientError is re-raised.
+    """
+    import json  # build the policy as data so no '\/' escapes (invalid in Python) are needed
+    policy = [{
+        'Rules': [
+            {
+                'Resource': ['index/test*/*'],
+                'Permission': [
+                    'aoss:CreateIndex',
+                    'aoss:DeleteIndex',
+                    'aoss:UpdateIndex',
+                    'aoss:DescribeIndex',
+                    'aoss:ReadDocument',
+                    'aoss:WriteDocument',
+                ],
+                'ResourceType': 'index',
+            },
+            {
+                'Resource': ['collection/test*'],
+                'Permission': [
+                    'aoss:CreateCollectionItems',
+                    'aoss:DeleteCollectionItems',
+                    'aoss:UpdateCollectionItems',
+                    'aoss:DescribeCollectionItems',
+                ],
+                'ResourceType': 'collection',
+            },
+        ],
+        'Principal': [principal],
+    }]
+    try:
+        response = client.create_access_policy(
+            description='Data access policy for Test collections',
+            name='test-policy',
+            policy=json.dumps(policy),
+            type='data'
+        )
+        print('\nAccess policy created:')
+        print(response)
+    except botocore.exceptions.ClientError as error:
+        if error.response['Error']['Code'] != 'ConflictException':
+            raise
+        print('[ConflictException] An access policy with this name already exists.')
+        
+
+        
+def waitForCollectionCreation(client, name=Vector_store_name, poll_seconds=10):
+    """Poll until collection `name` is ACTIVE and return its endpoint host (scheme stripped)."""
+    while True:
+        details = client.batch_get_collection(names=[name])['collectionDetails']
+        if not details or details[0]['status'] != 'CREATING':
+            break
+        time.sleep(poll_seconds)  # a single fixed sleep can return before the endpoint exists
+    if not details or details[0]['status'] != 'ACTIVE':
+        raise RuntimeError(f'Collection {name!r} did not become ACTIVE: {details}')
+    print('\nCollection successfully created:', details, sep='\n')
+    return details[0]['collectionEndpoint'].replace("https://", "")
+
+
+def main():
+    """Create the three policies and the collection, then print the collection endpoint."""
+    for create_policy in (createEncryptionPolicy, createNetworkPolicy, createAccessPolicy):
+        create_policy(client)
+    creation_response = client.create_collection(
+        name=Vector_store_name, type='VECTORSEARCH')
+    endpoint_host = waitForCollectionCreation(client)
+    print("Collection created successfully:", creation_response)
+    print("Collection ENDPOINT:", endpoint_host)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/Part 1/Delete-Vector-DB.py b/Part 1/Delete-Vector-DB.py
new file mode 100644
index 0000000..e69de29
diff --git a/Part 1/Delete-s3.py b/Part 1/Delete-s3.py
new file mode 100644
index 0000000..7d07f21
--- /dev/null
+++ b/Part 1/Delete-s3.py	
@@ -0,0 +1,24 @@
+import boto3
+
+BUCKET_NAME = 'cloud-lecture-2023'
+
+S3_CLIENT = boto3.client('s3')
+S3_RESOURCE = boto3.resource('s3')
+
+# Tear-down script: removes every object from BUCKET_NAME and then
+# deletes the bucket itself, printing the delete_bucket response.
+
+# Step 1: empty the bucket through the resource API.
+target_bucket = S3_RESOURCE.Bucket(BUCKET_NAME)
+
+print("Deleting all objects in Bucket\n")
+all_objects = target_bucket.objects.all()
+all_objects.delete()
+
+
+# Step 2: delete the bucket through the client API.
+print("Deleting Bucket")
+delete_request = {'Bucket': BUCKET_NAME}
+deletion_result = S3_CLIENT.delete_bucket(**delete_request)
+
+print(deletion_result)
diff --git a/Part 1/create-S3-and-put-docs.py b/Part 1/create-S3-and-put-docs.py
new file mode 100644
index 0000000..7587e0c
--- /dev/null
+++ b/Part 1/create-S3-and-put-docs.py	
@@ -0,0 +1,35 @@
+import boto3
+import os
+
+LOCAL_DIR = "pdfs"
+BUCKET_NAME = 'cloud-lecture-2023'
+
+s3_client = boto3.client('s3')  # S3 client shared by the whole script
+
+# Create S3 Bucket. Outside us-east-1 S3 requires an explicit LocationConstraint.
+create_args = {'Bucket': BUCKET_NAME}
+if s3_client.meta.region_name not in (None, 'us-east-1', 'aws-global'):
+    create_args['CreateBucketConfiguration'] = {'LocationConstraint': s3_client.meta.region_name}
+print("Creating Bucket")
+response = s3_client.create_bucket(**create_args)
+print(response)
+print()
+
+# Function to write files to S3
+def write_files(directory, bucket):
+    """Upload each '.pdf' entry of `directory` to `bucket`, keyed by its file name."""
+    pdf_names = [entry for entry in os.listdir(directory) if entry.endswith(".pdf")]
+    for pdf_name in pdf_names:
+        local_path = os.path.join(directory, pdf_name)
+        # Binary mode: the open handle itself is handed to put_object as the body.
+        with open(local_path, 'rb') as pdf_stream:
+            print(f"Uploading {pdf_name} to bucket {bucket}...")
+            s3_client.put_object(
+                Body=pdf_stream, Bucket=bucket, Key=pdf_name,
+            )
+            print(f"{pdf_name} uploaded successfully.")
+
+# Upload every ".pdf" file found in LOCAL_DIR to the BUCKET_NAME bucket
+print("Writing Items to Bucket")
+write_files(LOCAL_DIR, BUCKET_NAME)
+
diff --git a/Part 1/main.py b/Part 1/main.py
index b318c01..f12dafd 100644
--- a/Part 1/main.py	
+++ b/Part 1/main.py	
@@ -14,7 +14,7 @@ index_name = "cloud_lecture_test"
 ## S3_client
 s3_client = boto3.client('s3')
 ## Bucket name where documents are stored
-BUCKET_NAME = "chatbotlab"
+BUCKET_NAME = "cloud-lecture-2023"
 
 ## Bedrock client
 bedrock_client = boto3.client(service_name="bedrock-runtime")
@@ -25,7 +25,7 @@ credentials = boto3.Session().get_credentials()
 awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
 
 ## Vector DB endpoint
-host= 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
+host= 'ispfynbvy6eov4efdsqd.us-east-1.aoss.amazonaws.com'
 
 ## Opensearch Client
 OpenSearch_client = OpenSearch(
@@ -91,8 +91,12 @@ def split_text(pages, chunk_size, chunk_overlap, local_dir):
     return chunks
  
 ## Generate embeddings and index them using Opensearch
+# def generate_embeddings():
 
-def generate_embeddings(bedrock_client, chunks,awsauth,index_name):
+# def store_embeddings():
+
+
+def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
     embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
     docsearch = OpenSearchVectorSearch.from_documents(
         chunks,
@@ -117,7 +121,7 @@ def main():
     chunks=split_text(docs, 1000, 100, LOCAL_DIR)
     print("Sample chunk:", chunks[0])
     create_index(index_name)
-    embeddings = generate_embeddings(bedrock_client, chunks,awsauth,index_name)
+    embeddings = generate_store_embeddings(bedrock_client, chunks,awsauth,index_name)
     print("Embeddings processing completed", embeddings)
 
 
-- 
GitLab