Commit a5728da8
authored 8 months ago by abir.chebbi

adjust the creation of the vectorDB

parent 65c2ecaa
Showing 4 changed files with 135 additions and 121 deletions:

  Part 1/create-S3-and-put-docs.py   +19 −15
  Part 1/create-vector-db.py         +88 −85
  Part 1/main.py                     +27 −21
  Part 2/main.py                      +1 −0
Part 1/create-S3-and-put-docs.py  (+19 −15)

  import boto3
  import os
+ import argparse

- LOCAL_DIR = "pdfs"
- BUCKET_NAME = 'cloud-lecture-nabil-2024-25'
-
- # Initiate S3 client
- s3_client = boto3.client('s3')
-
- # Create S3 Bucket
- print("Creating Bucket")
- response = s3_client.create_bucket(Bucket=BUCKET_NAME,)
- print(response)
- print()
+ def create_bucket(s3_client, bucket_name):
+     """
+     Create an S3 bucket
+     """
+     print("Creating Bucket")
+     response = s3_client.create_bucket(Bucket=bucket_name)
+     print(response)
+     print()

  # Function to write files to S3
- def write_files(directory, bucket):
+ def write_files(s3_client, directory, bucket):
      for filename in os.listdir(directory):
          if filename.endswith(".pdf"):  # Check if the file is a PDF
              file_path = os.path.join(directory, filename)
  ...
  @@ -28,8 +24,16 @@ def write_files(directory, bucket):
                  Key=filename
              )
              print(f"{filename} uploaded successfully.")

- # Upload PDF files to S3 bucket
- print("Writing Items to Bucket")
- write_files(LOCAL_DIR, BUCKET_NAME)
+ def main(bucket_name, local_dir):
+     s3_client = boto3.client('s3')
+     create_bucket(s3_client, bucket_name)
+     write_files(s3_client, local_dir, bucket_name)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
+     parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
+     parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
+     args = parser.parse_args()
+     main(args.bucket_name, args.LOCAL_DIR)
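One caveat about the new create_bucket helper: boto3's create_bucket succeeds with only a Bucket argument in us-east-1, but in any other region it must pass a CreateBucketConfiguration with a LocationConstraint. A minimal sketch of that variant (the region value and bucket name are placeholders, not values from this repository):

    import boto3

    def create_bucket(s3_client, bucket_name, region="eu-west-1"):  # region value is an example
        """Create an S3 bucket, adding the LocationConstraint required outside us-east-1."""
        if region == "us-east-1":
            return s3_client.create_bucket(Bucket=bucket_name)
        return s3_client.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": region},
        )

    # s3_client = boto3.client("s3", region_name="eu-west-1")
    # create_bucket(s3_client, "my-lecture-bucket")  # bucket name is a placeholder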
Part 1/create-vector-db.py  (+88 −85)

  ...
  @@ -2,30 +2,30 @@
  import boto3
  import botocore
  import time
+ import argparse

  client = boto3.client('opensearchserverless')
  #service = 'aoss'
- Vector_store_name = 'test-nabil'

- def createEncryptionPolicy(client):
+ def createEncryptionPolicy(client, policy_name, collection_name):
      """
      Creates an encryption policy
-     that matches all collections beginning with test
+     for the specified collection.
      """
      try:
          response = client.create_security_policy(
-             description='Encryption policy for test collections',
-             name='test-policy',
-             policy="""
-                 {
-                     \"Rules\":[
-                         {
-                             \"ResourceType\":\"collection\",
-                             \"Resource\":[
-                                 \"collection\/test*\"
-                             ]
-                         }
-                     ],
-                     \"AWSOwnedKey\":true
-                 }
-                 """,
+             description=f'Encryption policy for {collection_name}',
+             name=policy_name,
+             policy=f"""
+                 {{
+                     \"Rules\": [
+                         {{
+                             \"ResourceType\": \"collection\",
+                             \"Resource\": [
+                                 \"collection/{collection_name}\"
+                             ]
+                         }}
+                     ],
+                     \"AWSOwnedKey\": true
+                 }}
+                 """,
              type='encryption'
          )
  ...
  @@ -39,27 +39,27 @@ def createEncryptionPolicy(client):
          raise error

- def createNetworkPolicy(client):
+ def createNetworkPolicy(client, policy_name, collection_name):
      """
      Creates a network policy
-     that matches all collections beginning with test
+     for the specified collection.
      """
      try:
          response = client.create_security_policy(
-             description='Network policy for Test collections',
-             name='test-policy',
-             policy="""
-                 [{
-                     \"Description\":\"Public access for Test collection\",
-                     \"Rules\":[
-                         {
-                             \"ResourceType\":\"dashboard\",
-                             \"Resource\":[\"collection\/test*\"]
-                         },
-                         {
-                             \"ResourceType\":\"collection\",
-                             \"Resource\":[\"collection\/test*\"]
-                         }
-                     ],
-                     \"AllowFromPublic\":true
-                 }]
-                 """,
+             description=f'Network policy for {collection_name}',
+             name=policy_name,
+             policy=f"""
+                 [{{
+                     \"Description\": \"Public access for {collection_name}\",
+                     \"Rules\": [
+                         {{
+                             \"ResourceType\": \"dashboard\",
+                             \"Resource\": [\"collection/{collection_name}\"]
+                         }},
+                         {{
+                             \"ResourceType\": \"collection\",
+                             \"Resource\": [\"collection/{collection_name}\"]
+                         }}
+                     ],
+                     \"AllowFromPublic\": true
+                 }}]
+                 """,
              type='network'
          )
  ...
  @@ -73,65 +73,62 @@ def createNetworkPolicy(client):
          raise error

- def createAccessPolicy(client):
+ def createAccessPolicy(client, policy_name, collection_name, IAM_USER):
      """
      Creates a data access policy
-     that matches all collections beginning with test
+     for the specified collection.
      """
      try:
+         policy_content = f"""
+         [
+             {{
+                 "Rules": [
+                     {{
+                         "Resource": [
+                             "collection/{collection_name}"
+                         ],
+                         "Permission": [
+                             "aoss:CreateCollectionItems",
+                             "aoss:DeleteCollectionItems",
+                             "aoss:UpdateCollectionItems",
+                             "aoss:DescribeCollectionItems"
+                         ],
+                         "ResourceType": "collection"
+                     }},
+                     {{
+                         "Resource": [
+                             "index/{collection_name}/*"
+                         ],
+                         "Permission": [
+                             "aoss:CreateIndex",
+                             "aoss:DeleteIndex",
+                             "aoss:UpdateIndex",
+                             "aoss:DescribeIndex",
+                             "aoss:ReadDocument",
+                             "aoss:WriteDocument"
+                         ],
+                         "ResourceType": "index"
+                     }}
+                 ],
+                 "Principal": [
+                     "arn:aws:iam::352909266144:user/{IAM_USER}"
+                 ]
+             }}
+         ]
+         """
          response = client.create_access_policy(
-             description='Data access policy for Test collections',
-             name='test-policy',
-             policy="""
-                 [{
-                     \"Rules\":[
-                         {
-                             \"Resource\":[
-                                 \"index\/test*\/*\"
-                             ],
-                             \"Permission\":[
-                                 \"aoss:CreateIndex\",
-                                 \"aoss:DeleteIndex\",
-                                 \"aoss:UpdateIndex\",
-                                 \"aoss:DescribeIndex\",
-                                 \"aoss:ReadDocument\",
-                                 \"aoss:WriteDocument\"
-                             ],
-                             \"ResourceType\": \"index\"
-                         },
-                         {
-                             \"Resource\":[
-                                 \"collection\/test*\"
-                             ],
-                             \"Permission\":[
-                                 \"aoss:CreateCollectionItems\",
-                                 \"aoss:DeleteCollectionItems\",
-                                 \"aoss:UpdateCollectionItems\",
-                                 \"aoss:DescribeCollectionItems\"
-                             ],
-                             \"ResourceType\": \"collection\"
-                         }
-                     ],
-                     \"Principal\":[
-                         \"arn:aws:iam::768034348959:user/AbirChebbi\"
-                     ]
-                 }]
-                 """,
+             description=f'Data access policy for {collection_name}',
+             name=policy_name,
+             policy=policy_content,
              type='data'
          )
          print('\nAccess policy created:')
          print(response)
      except botocore.exceptions.ClientError as error:
          if error.response['Error']['Code'] == 'ConflictException':
              print('[ConflictException] An access policy with this name already exists.')
          else:
              raise error

- def waitForCollectionCreation(client):
+ def waitForCollectionCreation(client, collection_name):
      """
      Waits for the collection to become active
      """
-     time.sleep(40)
-     response = client.batch_get_collection(names=['test1'])
+     time.sleep(30)
+     response = client.batch_get_collection(names=[collection_name])
      print('\nCollection successfully created:')
      print(response["collectionDetails"])
      # Extract the collection endpoint from the response
  ...
  @@ -140,16 +137,22 @@ def waitForCollectionCreation(client):
      return final_host

- def main():
-     createEncryptionPolicy(client)
-     createNetworkPolicy(client)
-     createAccessPolicy(client)
-     collection = client.create_collection(name=Vector_store_name, type='VECTORSEARCH')
-     ENDPOINT = waitForCollectionCreation(client)
+ def main(collection_name, IAM_USER):
+     encryption_policy_name = f'{collection_name}-encryption-policy'
+     network_policy_name = f'{collection_name}-network-policy'
+     access_policy_name = f'{collection_name}-access-policy'
+     createEncryptionPolicy(client, encryption_policy_name, collection_name)
+     createNetworkPolicy(client, network_policy_name, collection_name)
+     createAccessPolicy(client, access_policy_name, collection_name, IAM_USER)
+     collection = client.create_collection(name=collection_name, type='VECTORSEARCH')
+     ENDPOINT = waitForCollectionCreation(client, collection_name)
      print("Collection created successfully:", collection)
      print("Collection ENDPOINT:", ENDPOINT)

  if __name__ == "__main__":
-     main()
  \ No newline at end of file
+     parser = argparse.ArgumentParser(description="Create collection")
+     parser.add_argument("collection_name", help="The name of the collection")
+     parser.add_argument("iam_user", help="The iam user")
+     args = parser.parse_args()
+     main(args.collection_name, args.iam_user)
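The body of waitForCollectionCreation is mostly collapsed above; the visible part still waits with a fixed time.sleep before a single batch_get_collection call. If the fixed delay turns out to be too short, a polling loop is a common alternative. A minimal sketch, assuming the endpoint is read from collectionDetails (the helper name, delay, and attempt count are illustrative, not taken from the repository):

    import time
    import boto3

    def wait_until_active(client, collection_name, delay=10, max_attempts=30):
        """Poll batch_get_collection until the collection reports ACTIVE, then return its host."""
        for _ in range(max_attempts):
            response = client.batch_get_collection(names=[collection_name])
            details = response.get("collectionDetails", [])
            if details and details[0].get("status") == "ACTIVE":
                # collectionEndpoint looks like https://<id>.<region>.aoss.amazonaws.com
                return details[0]["collectionEndpoint"].replace("https://", "")
            time.sleep(delay)
        raise TimeoutError(f"Collection {collection_name} did not become ACTIVE in time")

    # client = boto3.client("opensearchserverless")
    # host = wait_until_active(client, "my-collection")  # collection name is a placeholder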
Part 1/main.py  (+27 −21)

  ...
  @@ -8,41 +8,30 @@ from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
  from langchain_community.vectorstores import OpenSearchVectorSearch
  import uuid
  import json
+ import argparse

  ## Local directory for storing PDF files
  LOCAL_DIR = "pdfs"
- index_name = "cloud_lecture"

  ## S3_client
  s3_client = boto3.client('s3')

- ## Bucket name where documents are stored
- BUCKET_NAME = "cloud-lecture-2023"

  ## Bedrock client
  bedrock_client = boto3.client(service_name="bedrock-runtime")

  ## Configuration for AWS authentication and OpenSearch client
- credentials = boto3.Session().get_credentials()
+ credentials = boto3.Session(profile_name='master-group-14').get_credentials()
  awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')

- ## Vector DB endpoint
- host = 'j6phg34iv0f2rlvxwawd.us-east-1.aoss.amazonaws.com'
-
- ## Opensearch Client
- OpenSearch_client = OpenSearch(
-     hosts=[{'host': host, 'port': 443}],
-     http_auth=awsauth,
-     use_ssl=True,
-     verify_certs=True,
-     connection_class=RequestsHttpConnection,
- )

  ## Create Index in Opensearch
- def create_index(index_name):
+ def create_index(client, index_name):
      indexBody = {
          "settings": {
              "index.knn": True
  ...
  @@ -62,7 +51,7 @@ def create_index(index_name):
      }
      try:
-         create_response = OpenSearch_client.indices.create(index_name, body=indexBody)
+         create_response = client.indices.create(index_name, body=indexBody)
          print('\nCreating index:')
          print(create_response)
      except Exception as e:
  ...
  @@ -101,6 +90,7 @@ def generate_embeddings(bedrock_client, chunks):
  # Store generated embeddings into an OpenSearch index.
  def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
      docsearch = OpenSearchVectorSearch.from_embeddings(
          embeddings,
          texts,
  ...
  @@ -137,14 +127,25 @@ def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
  ## main
- def main():
-     download_documents(BUCKET_NAME, LOCAL_DIR)
+ def main(bucket_name, endpoint, index_name):
+     ## Opensearch Client
+     OpenSearch_client = OpenSearch(
+         hosts=[{'host': endpoint, 'port': 443}],
+         http_auth=awsauth,
+         use_ssl=True,
+         verify_certs=True,
+         connection_class=RequestsHttpConnection,
+     )
+     download_documents(bucket_name, LOCAL_DIR)
      loader = PyPDFDirectoryLoader(LOCAL_DIR)
      docs = loader.load()
      print(docs[1])
      chunks = split_text(docs, 1000, 100)
      print(chunks[1])
+     create_index(OpenSearch_client, index_name)
      embeddings = generate_embeddings(bedrock_client, chunks)
      print(embeddings[1])
      texts = [chunk.page_content for chunk in chunks]
  ...
  @@ -152,7 +153,7 @@ def main():
      meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
      print(embeddings[1])
      print(meta_data[1])
-     store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name)
+     store_embeddings(embeddings, texts, meta_data, endpoint, awsauth, index_name)
  ...
  @@ -163,4 +164,9 @@ def main():
  if __name__ == "__main__":
-     main()
+     parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
+     parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
+     parser.add_argument("endpoint", help="The OpenSearch service endpoint")
+     parser.add_argument("index_name", help="The name of the OpenSearch index")
+     args = parser.parse_args()
+     main(args.bucket_name, args.endpoint, args.index_name)
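Only the "index.knn": True setting of indexBody is visible in this hunk; the mappings are collapsed. For orientation, a typical k-NN index body for Titan text embeddings looks like the sketch below; the 1536-dimension value and the field names are assumptions about the course setup, not taken from the repository:

    # Illustrative only: the repo's actual indexBody is collapsed in the diff above.
    indexBody = {
        "settings": {
            "index.knn": True            # enable the k-NN plugin for this index
        },
        "mappings": {
            "properties": {
                "vector_field": {        # field name is an assumption
                    "type": "knn_vector",
                    "dimension": 1536    # dimension of amazon.titan-embed-text-v1 (assumed model)
                },
                "text": {"type": "text"},
                "metadata": {"type": "object"}
            }
        }
    }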
Part 2/main.py  (+1 −0)

  ...
  @@ -107,6 +107,7 @@ def main():
      st.session_state.chat_history.append({"role": "user", "content": user_prompt})

      # Generate and display answer
      print(user_prompt)
      embed_question = get_embedding(user_prompt, bedrock_client)
      print(embed_question)
      sim_results = similarity_search(embed_question, index_name)
  ...
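The get_embedding helper used above is defined outside this hunk. A common way to implement it against a Bedrock Titan embedding model is sketched below; the model id, request body, and response handling are assumptions about the setup, not code taken from this repository:

    import json
    import boto3

    def get_embedding(text, bedrock_client, model_id="amazon.titan-embed-text-v1"):  # model id is assumed
        """Return the embedding vector for `text` from a Bedrock embedding model."""
        response = bedrock_client.invoke_model(
            modelId=model_id,
            contentType="application/json",
            accept="application/json",
            body=json.dumps({"inputText": text}),
        )
        return json.loads(response["body"].read())["embedding"]

    # bedrock_client = boto3.client("bedrock-runtime")
    # vector = get_embedding("What is OpenSearch Serverless?", bedrock_client)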