diff --git a/Labo 1 2024.pdf b/Labo 1 2024.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3f057ac5eca20e92abc4b9535e14ce734698992e
Binary files /dev/null and b/Labo 1 2024.pdf differ
diff --git a/Part1 - Google/pip b/Part1 - Google/pip
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Part1 - Google/service-account-file.json b/Part1 - Google/service-account-file.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a3ef7db7b71b75b66a37b7e46e6e43d32d09fa0
--- /dev/null
+++ b/Part1 - Google/service-account-file.json
@@ -0,0 +1,13 @@
+{
+  "type": "service_account",
+  "project_id": "mse-test-project-436514",
+  "private_key_id": "2645e2680f535ae1246844ac7ecca7e6c1212fd6",
+  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDP72/azCfcLcj/\nh0IvTILjDUBEmAXTKcfm4q05WSko0ReS4t9qDA8WOpxZSQjOqj9R+QDJ7Z72iLIY\n0Z7ers5x9y6lZ227gP0IIur+gWsV003IDHdQvEK3X1rI4bX5XbUgAYsb0R9am7DN\n0cdiu3RFrf7/MQUbRkq1zR7ahDbgzTt3S6Bl/CifLJ2rstf5g3qMNPcC0KbnUqsc\n2utcJAFSa0EfX2KUeSOYsB3isA5GagEZkXueYqVIIxtegWi2qHWuzigeCIlCgrdj\nZAlxhR6RdsB03MAOsJF6B0Z4SlGZzPHB91KtMnEcvMduvTciSSFKXuM2YwqZg7VX\nTX0hUoOvAgMBAAECggEAFSCsCdxClJGmYahZpjqs9esLw6x6swkEwcX4eeJAV2Bf\nd9JLhxfwlH6iEkRke7M6udaGKP5W74/WIMVz0gaK/XNzLuVCdhHWI/SAUhnOSqps\ntc3mdbKbSMyMieq9Nbg6xiTCALKP8AHvxgnxq2uGlennBgDyFuJehvhvkR5sAQ1K\ngStlVbnejW8ZNRFrjkbaP1G9op2CacLrU/5S+Okr6AFcKFh5QmGiLESMiihJuuGZ\npvfMkNzrrA9K70g94twt06vEU2SiGHdBQ2cGUVZYXcsI+Avbqq+/pfj3WxfwXqqh\nDx/HzhiUmEPjE5exa0ArnwxuAeUBILqhMhTeNpfnWQKBgQDo6UDyu6Xvm9THjb5w\nSAiOCjZaGvCkTQZaedl2JWBtNO7H3W8Vccoll32HpHG7L6mIeLP9I2Lk+AUZOWhU\nlQLHy9ofToAs9ZSZpSyTAg1HKK/REMiU9eOez2yEQ5iWqKYXv79OJpyXM06uSx5/\nyz8T9ZQxz9qFzdMiiPbuWMVIAwKBgQDkjFqfeYsSolLGimuO4Sx6Pre5ObearXgP\noYUNwGODdkg4wm7zpJc2XiDBlL/iyW2Gyt4M2jTmJI+wKOWsGPTPOTMBk7cNLbMx\nDiGPaQXAG1XDtxYj2TKojoRBkbfJX63NI6vkKRL/vzMmbCJ2y1lKX0j65LTrwm8b\nGhIdn9Wz5QKBgQCFYYbjOxkFBe2ttfu4W1gi17BWm5Tx0nZv+9XQNglpoOWZqbLC\nyh5ktsOZmU/UTbA9yjnxHoG09GAfGOQphAhKmPA5+3+lv6Gw94l2SreF58P/6yej\nPslymgDgIcHRjZVIhnOs8qm8YRKO98/oiWF/MaUDfa/77moaHeujhUy9NwKBgQCM\nswNPTioZ7Kh85dZVfbY+A8JjW2724HgbV1psHtakpfrMRpa7k8YriEMuKX8ABPVS\nmC2fR+5tCHEVB/hsvGhp8lK+U8vLZyj7uDFc8lDB9ZIVDO+qXhpbvnEZVLYKWMbM\nlXtK2SaDH5hDvSpya7mqmYJ6QrZGtcpkquYgKrgLKQKBgQDmooLfchORwvl0szmB\nXkpz1B52UT860cIVnfvatm6ImPqwSPGrDKJDgpbeoDaMKf2Z/pmLxWtFIzJQRXew\n53U1d2diEGprBzUhQUBQju1bLcpQkPYyVov7ZYudahOijt8pj35Zz0HsyFkDYQvv\nnRn2cosZM+uzYP9QlVgGIAS2Ig==\n-----END PRIVATE KEY-----\n",
+  "client_email": "vertexai@mse-test-project-436514.iam.gserviceaccount.com",
+  "client_id": "103535310171085862136",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://oauth2.googleapis.com/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/vertexai%40mse-test-project-436514.iam.gserviceaccount.com",
+  "universe_domain": "googleapis.com"
+}
diff --git a/Part1 - Google/vectorise-store.py b/Part1 - Google/vectorise-store.py
index 3e5eda49610b0325c13770b2c468dbff5ba3f2c3..77a4b8cb71c274383a6a15d40d6c46a9c04ff68c 100644
--- a/Part1 - Google/vectorise-store.py
+++ b/Part1 - Google/vectorise-store.py
@@ -1,170 +1,77 @@
-# Creator: Abir Chebbi (abir.chebbi@hesge.ch)
-
-import boto3
 import os
+import argparse
+from google.cloud import storage
+from google.cloud import aiplatform
 from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import BedrockEmbeddings
-from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
-from langchain_community.vectorstores import OpenSearchVectorSearch
-import argparse
-
-
-
-
-## S3_client
-s3_client = boto3.client('s3')
-
-## Bedrock client
-bedrock_client = boto3.client(service_name="bedrock-runtime")
-
-## Configuration for AWS authentication and OpenSearch client
-credentials = boto3.Session().get_credentials()
-awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
+# Configure Google Cloud authentication
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-account-file.json"
+# Configure the Google Cloud Storage client
+storage_client = storage.Client()
+# Download documents from Google Cloud Storage
+def download_documents(bucket_name, local_dir):
+    bucket = storage_client.bucket(bucket_name)
+    blobs = bucket.list_blobs()
+    for blob in blobs:
+        if blob.name.endswith('.pdf'):
+            local_filename = os.path.join(local_dir, blob.name)
+            os.makedirs(os.path.dirname(local_filename), exist_ok=True)  # Ensure the target directory exists
+            blob.download_to_filename(local_filename)
+            print(f'Downloaded {blob.name} to {local_filename}')
-
-## Create Index in Opensearch
-def create_index(client,index_name):
-    indexBody = {
-        "settings": {
-            "index.knn": True
-        },
-        "mappings": {
-            "properties": {
-                "vector_field": {
-                    "type": "knn_vector",
-                    "dimension": 1536,
-                    "method": {
-                        "engine": "faiss",
-                        "name": "hnsw"
-                    }
-                }
-            }
-        }
-    }
-
-    try:
-        create_response = client.indices.create(index_name, body=indexBody)
-        print('\nCreating index:')
-        print(create_response)
-    except Exception as e:
-        print(e)
-        print("(Index likely already exists?)")
-
-
-
-## Load docs from S3
-def download_documents(bucket_name,local_dir):
-    response = s3_client.list_objects_v2(Bucket=bucket_name)
-    for item in response['Contents']:
-        key = item['Key']
-        if key.endswith('.pdf'):
-            local_filename = os.path.join(local_dir, key)
-            s3_client.download_file(Bucket=bucket_name, Key=key, Filename=local_filename)
-
-
-
-
-
-## Split pages/text into chunks
+# Split the pages/text into chunks
 def split_text(docs, chunk_size, chunk_overlap):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = text_splitter.split_documents(docs)
-
+    chunks = text_splitter.split_documents(docs)
     return chunks
-
-## Generate embeddings
-def generate_embeddings(bedrock_client, chunks):
-    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
-    chunks_list=[chunk.page_content for chunk in chunks]
-    embeddings = embeddings_model.embed_documents(chunks_list)
-    return embeddings
-
-# Store generated embeddings into an OpenSearch index.
-def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
-
-    docsearch = OpenSearchVectorSearch.from_embeddings(
-        embeddings,
-        texts,
-        meta_data,
-        opensearch_url=f'https://{host}:443',
-        http_auth=awsauth,
-        use_ssl=True,
-        verify_certs=True,
-        connection_class=RequestsHttpConnection,
-        index_name=index_name,
-        bulk_size=1000
-)
-
-    return docsearch
+# Generate embeddings
+def generate_embeddings(texts):
+    # Initialize Vertex AI
+    aiplatform.init(project="mse-test-project-436514", location="us-central1")
-# Func to do both generating and storing embeddings
-def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name):
-    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
-    docsearch = OpenSearchVectorSearch.from_documents(
-        chunks,
-        embeddings_model,
-        opensearch_url=f'https://{host}:443',
-        http_auth=awsauth,
-        use_ssl=True,
-        verify_certs=True,
-        connection_class=RequestsHttpConnection,
-        index_name=index_name,
-        bulk_size=1000
-)
+    # Replace with your endpoint ID
+    endpoint_id = "2223196018688655360"  # Replace with the real ID
-    return docsearch
+    # Get the endpoint used to generate embeddings
+    endpoint = aiplatform.Endpoint(endpoint_id)
+    # Build the instances with the "inputs" field the deployed model expects
+    instances = [{"inputs": text} for text in texts]
+    # Generate the embeddings via Vertex AI
+    response = endpoint.predict(instances=instances)
-## main
-def main(bucket_name, endpoint,index_name, local_path):
+    # Retrieve and return the embeddings
+    return response.predictions
-    ## Opensearch Client
-    OpenSearch_client = OpenSearch(
-        hosts=[{'host': endpoint, 'port': 443}],
-        http_auth=awsauth,
-        use_ssl=True,
-        verify_certs=True,
-        connection_class=RequestsHttpConnection,
-
-    )
-
-    download_documents(bucket_name,local_path)
-    loader= PyPDFDirectoryLoader(local_path)
+# Main function
+def main(bucket_name, index_name, local_path):
+    download_documents(bucket_name, local_path)
+
+    # Load the documents
+    loader = PyPDFDirectoryLoader(local_path)
     docs = loader.load()
     print('Start chunking')
+
     chunks = split_text(docs, 1000, 100)
-    print(chunks[1])
-    create_index(OpenSearch_client,index_name)
-    print('Start vectorising')
-    embeddings= generate_embeddings(bedrock_client, chunks)
-    print(embeddings[1])
     texts = [chunk.page_content for chunk in chunks]
-    # Prepare metadata for each chunk
-    meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
-    print('Start storing')
-    store_embeddings(embeddings, texts, meta_data ,endpoint, awsauth,index_name)
-    print('End storing')
-
-
-
-
-
-
-
+
+    print('Start vectorizing')
+    embeddings = generate_embeddings(texts)
+
+    # Storage or further processing of the embeddings goes here
+    print('Embeddings generated:', embeddings)
+    print('End processing')
 
-if __name__== "__main__":
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
-    parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
-    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
-    parser.add_argument("--index_name", help="The name of the OpenSearch index")
-    parser.add_argument("--local_path", help="local path")
+    parser.add_argument("--bucket_name", help="The GCS bucket name where documents are stored")
+    parser.add_argument("--index_name", help="The name of the index for storing embeddings (if applicable)")
parser.add_argument("--local_path", help="Local path to store downloaded files") args = parser.parse_args() - main(args.bucket_name, args.endpoint, args.index_name, args.local_path) + main(args.bucket_name, args.index_name, args.local_path) diff --git a/Pellandini_Proxmox_final.pdf b/Pellandini_Proxmox_final.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bb172a12b61d610c9ef80c382d5bebb79ca13ac3 Binary files /dev/null and b/Pellandini_Proxmox_final.pdf differ