Commit ca661662 authored by Leo Pellandini

embedding

parent 8cb864ab
File added
{
"type": "service_account",
"project_id": "mse-test-project-436514",
"private_key_id": "2645e2680f535ae1246844ac7ecca7e6c1212fd6",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDP72/azCfcLcj/\nh0IvTILjDUBEmAXTKcfm4q05WSko0ReS4t9qDA8WOpxZSQjOqj9R+QDJ7Z72iLIY\n0Z7ers5x9y6lZ227gP0IIur+gWsV003IDHdQvEK3X1rI4bX5XbUgAYsb0R9am7DN\n0cdiu3RFrf7/MQUbRkq1zR7ahDbgzTt3S6Bl/CifLJ2rstf5g3qMNPcC0KbnUqsc\n2utcJAFSa0EfX2KUeSOYsB3isA5GagEZkXueYqVIIxtegWi2qHWuzigeCIlCgrdj\nZAlxhR6RdsB03MAOsJF6B0Z4SlGZzPHB91KtMnEcvMduvTciSSFKXuM2YwqZg7VX\nTX0hUoOvAgMBAAECggEAFSCsCdxClJGmYahZpjqs9esLw6x6swkEwcX4eeJAV2Bf\nd9JLhxfwlH6iEkRke7M6udaGKP5W74/WIMVz0gaK/XNzLuVCdhHWI/SAUhnOSqps\ntc3mdbKbSMyMieq9Nbg6xiTCALKP8AHvxgnxq2uGlennBgDyFuJehvhvkR5sAQ1K\ngStlVbnejW8ZNRFrjkbaP1G9op2CacLrU/5S+Okr6AFcKFh5QmGiLESMiihJuuGZ\npvfMkNzrrA9K70g94twt06vEU2SiGHdBQ2cGUVZYXcsI+Avbqq+/pfj3WxfwXqqh\nDx/HzhiUmEPjE5exa0ArnwxuAeUBILqhMhTeNpfnWQKBgQDo6UDyu6Xvm9THjb5w\nSAiOCjZaGvCkTQZaedl2JWBtNO7H3W8Vccoll32HpHG7L6mIeLP9I2Lk+AUZOWhU\nlQLHy9ofToAs9ZSZpSyTAg1HKK/REMiU9eOez2yEQ5iWqKYXv79OJpyXM06uSx5/\nyz8T9ZQxz9qFzdMiiPbuWMVIAwKBgQDkjFqfeYsSolLGimuO4Sx6Pre5ObearXgP\noYUNwGODdkg4wm7zpJc2XiDBlL/iyW2Gyt4M2jTmJI+wKOWsGPTPOTMBk7cNLbMx\nDiGPaQXAG1XDtxYj2TKojoRBkbfJX63NI6vkKRL/vzMmbCJ2y1lKX0j65LTrwm8b\nGhIdn9Wz5QKBgQCFYYbjOxkFBe2ttfu4W1gi17BWm5Tx0nZv+9XQNglpoOWZqbLC\nyh5ktsOZmU/UTbA9yjnxHoG09GAfGOQphAhKmPA5+3+lv6Gw94l2SreF58P/6yej\nPslymgDgIcHRjZVIhnOs8qm8YRKO98/oiWF/MaUDfa/77moaHeujhUy9NwKBgQCM\nswNPTioZ7Kh85dZVfbY+A8JjW2724HgbV1psHtakpfrMRpa7k8YriEMuKX8ABPVS\nmC2fR+5tCHEVB/hsvGhp8lK+U8vLZyj7uDFc8lDB9ZIVDO+qXhpbvnEZVLYKWMbM\nlXtK2SaDH5hDvSpya7mqmYJ6QrZGtcpkquYgKrgLKQKBgQDmooLfchORwvl0szmB\nXkpz1B52UT860cIVnfvatm6ImPqwSPGrDKJDgpbeoDaMKf2Z/pmLxWtFIzJQRXew\n53U1d2diEGprBzUhQUBQju1bLcpQkPYyVov7ZYudahOijt8pj35Zz0HsyFkDYQvv\nnRn2cosZM+uzYP9QlVgGIAS2Ig==\n-----END PRIVATE KEY-----\n",
"client_email": "vertexai@mse-test-project-436514.iam.gserviceaccount.com",
"client_id": "103535310171085862136",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/vertexai%40mse-test-project-436514.iam.gserviceaccount.com",
"universe_domain": "googleapis.com"
}
# Creator: Abir Chebbi (abir.chebbi@hesge.ch)
import boto3
import os
import argparse
from google.cloud import storage
from google.cloud import aiplatform
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain_community.vectorstores import OpenSearchVectorSearch
## S3_client
s3_client = boto3.client('s3')
## Bedrock client
bedrock_client = boto3.client(service_name="bedrock-runtime")
## Configuration for AWS authentication and OpenSearch client
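# Requests are signed with SigV4 for the OpenSearch Serverless service ('aoss') in us-east-1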
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
# Google Cloud authentication configuration
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-account-file.json"
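# Points Application Default Credentials at a service-account key file (presumably the JSON key added in this commit)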
# Google Cloud Storage client
storage_client = storage.Client()
# Download PDF documents from Google Cloud Storage
def download_documents_gcs(bucket_name, local_dir):
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs()
    for blob in blobs:
        if blob.name.endswith('.pdf'):
            local_filename = os.path.join(local_dir, blob.name)
            blob.download_to_filename(local_filename)
            print(f'Downloaded {blob.name} to {local_filename}')
## Create Index in Opensearch
def create_index(client,index_name):
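    # k-NN index: FAISS HNSW vectors of dimension 1536, matching the output size of amazon.titan-embed-text-v1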
indexBody = {
"settings": {
"index.knn": True
},
"mappings": {
"properties": {
"vector_field": {
"type": "knn_vector",
"dimension": 1536,
"method": {
"engine": "faiss",
"name": "hnsw"
}
}
}
}
}
try:
create_response = client.indices.create(index_name, body=indexBody)
print('\nCreating index:')
print(create_response)
except Exception as e:
print(e)
print("(Index likely already exists?)")
## Load docs from S3
def download_documents(bucket_name,local_dir):
response = s3_client.list_objects_v2(Bucket=bucket_name)
for item in response['Contents']:
key = item['Key']
if key.endswith('.pdf'):
local_filename = os.path.join(local_dir, key)
s3_client.download_file(Bucket=bucket_name, Key=key, Filename=local_filename)
## Split pages/text into chunks
def split_text(docs, chunk_size, chunk_overlap):
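    # chunk_size and chunk_overlap are measured in characters; main() below calls this with 1000 and 100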
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_documents(docs)
return chunks
## Generate embeddings
def generate_embeddings(bedrock_client, chunks):
embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
chunks_list=[chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(chunks_list)
return embeddings
# Store generated embeddings into an OpenSearch index.
def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
docsearch = OpenSearchVectorSearch.from_embeddings(
embeddings,
texts,
meta_data,
opensearch_url=f'https://{host}:443',
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
index_name=index_name,
bulk_size=1000
)
return docsearch
# Generate embeddings with a Vertex AI endpoint
def generate_embeddings_vertex(texts):
    # Initialize Vertex AI
    aiplatform.init(project="mse-test-project-436514", location="us-central1")
    # Replace with the ID of your endpoint
    endpoint_id = "2223196018688655360"  # Replace with the real ID
    # Get the endpoint used to generate the embeddings
    endpoint = aiplatform.Endpoint(endpoint_id)
    # Create the instances with the expected "inputs" field
    instances = [{"inputs": text} for text in texts]
    # Generate the embeddings via Vertex AI
    response = endpoint.predict(instances=instances)
    # Collect and return the embeddings
    return response.predictions
# Func to do both generating and storing embeddings
def generate_store_embeddings(bedrock_client, chunks, awsauth, host, index_name):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    docsearch = OpenSearchVectorSearch.from_documents(
        chunks,
        embeddings_model,
        opensearch_url=f'https://{host}:443',
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        index_name=index_name,
        bulk_size=1000
    )
    return docsearch
## main
def main(bucket_name, endpoint, index_name, local_path):
## Opensearch Client
OpenSearch_client = OpenSearch(
hosts=[{'host': endpoint, 'port': 443}],
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
)
download_documents(bucket_name,local_path)
loader= PyPDFDirectoryLoader(local_path)
    docs = loader.load()
print('Start chunking')
chunks = split_text(docs, 1000, 100)
print(chunks[1])
create_index(OpenSearch_client,index_name)
print('Start vectorising')
embeddings= generate_embeddings(bedrock_client, chunks)
print(embeddings[1])
texts = [chunk.page_content for chunk in chunks]
# Prepare metadata for each chunk
meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
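    # PyPDFLoader's 'page' metadata is 0-based, so +1 gives a human-readable page number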
print('Start storing')
store_embeddings(embeddings, texts, meta_data ,endpoint, awsauth,index_name)
print('End storing')
# Main function for the Google Cloud pipeline
def main_gcp(bucket_name, index_name, local_path):
    download_documents_gcs(bucket_name, local_path)
    # Load the documents
    loader = PyPDFDirectoryLoader(local_path)
    docs = loader.load()
    print('Start chunking')
    chunks = split_text(docs, 1000, 100)
    texts = [chunk.page_content for chunk in chunks]
    print('Start vectorizing')
    embeddings = generate_embeddings_vertex(texts)
    # Storage or further processing of the embeddings goes here
    print('Embeddings generated:', embeddings)
    print('End processing')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
    parser.add_argument("--bucket_name", help="The S3 or GCS bucket name where documents are stored")
    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
    parser.add_argument("--index_name", help="The name of the OpenSearch index used to store the embeddings")
    parser.add_argument("--local_path", help="Local path to store downloaded files")
    args = parser.parse_args()
    # AWS pipeline: Bedrock embeddings stored in OpenSearch
    main(args.bucket_name, args.endpoint, args.index_name, args.local_path)
    # Google Cloud pipeline: Vertex AI embeddings
    main_gcp(args.bucket_name, args.index_name, args.local_path)
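# Example invocation (a sketch: the bucket, endpoint and index values are placeholders, and the
# filename embedding.py is assumed from the commit message rather than shown on this page):
#   python embedding.py --bucket_name my-pdf-bucket \
#       --endpoint abc123.us-east-1.aoss.amazonaws.com \
#       --index_name rag-index \
#       --local_path ./pdfs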
File added