Commit ca661662 authored by Leo Pellandini

embedding

parent 8cb864ab
File added
{
"type": "service_account",
"project_id": "mse-test-project-436514",
"private_key_id": "2645e2680f535ae1246844ac7ecca7e6c1212fd6",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDP72/azCfcLcj/\nh0IvTILjDUBEmAXTKcfm4q05WSko0ReS4t9qDA8WOpxZSQjOqj9R+QDJ7Z72iLIY\n0Z7ers5x9y6lZ227gP0IIur+gWsV003IDHdQvEK3X1rI4bX5XbUgAYsb0R9am7DN\n0cdiu3RFrf7/MQUbRkq1zR7ahDbgzTt3S6Bl/CifLJ2rstf5g3qMNPcC0KbnUqsc\n2utcJAFSa0EfX2KUeSOYsB3isA5GagEZkXueYqVIIxtegWi2qHWuzigeCIlCgrdj\nZAlxhR6RdsB03MAOsJF6B0Z4SlGZzPHB91KtMnEcvMduvTciSSFKXuM2YwqZg7VX\nTX0hUoOvAgMBAAECggEAFSCsCdxClJGmYahZpjqs9esLw6x6swkEwcX4eeJAV2Bf\nd9JLhxfwlH6iEkRke7M6udaGKP5W74/WIMVz0gaK/XNzLuVCdhHWI/SAUhnOSqps\ntc3mdbKbSMyMieq9Nbg6xiTCALKP8AHvxgnxq2uGlennBgDyFuJehvhvkR5sAQ1K\ngStlVbnejW8ZNRFrjkbaP1G9op2CacLrU/5S+Okr6AFcKFh5QmGiLESMiihJuuGZ\npvfMkNzrrA9K70g94twt06vEU2SiGHdBQ2cGUVZYXcsI+Avbqq+/pfj3WxfwXqqh\nDx/HzhiUmEPjE5exa0ArnwxuAeUBILqhMhTeNpfnWQKBgQDo6UDyu6Xvm9THjb5w\nSAiOCjZaGvCkTQZaedl2JWBtNO7H3W8Vccoll32HpHG7L6mIeLP9I2Lk+AUZOWhU\nlQLHy9ofToAs9ZSZpSyTAg1HKK/REMiU9eOez2yEQ5iWqKYXv79OJpyXM06uSx5/\nyz8T9ZQxz9qFzdMiiPbuWMVIAwKBgQDkjFqfeYsSolLGimuO4Sx6Pre5ObearXgP\noYUNwGODdkg4wm7zpJc2XiDBlL/iyW2Gyt4M2jTmJI+wKOWsGPTPOTMBk7cNLbMx\nDiGPaQXAG1XDtxYj2TKojoRBkbfJX63NI6vkKRL/vzMmbCJ2y1lKX0j65LTrwm8b\nGhIdn9Wz5QKBgQCFYYbjOxkFBe2ttfu4W1gi17BWm5Tx0nZv+9XQNglpoOWZqbLC\nyh5ktsOZmU/UTbA9yjnxHoG09GAfGOQphAhKmPA5+3+lv6Gw94l2SreF58P/6yej\nPslymgDgIcHRjZVIhnOs8qm8YRKO98/oiWF/MaUDfa/77moaHeujhUy9NwKBgQCM\nswNPTioZ7Kh85dZVfbY+A8JjW2724HgbV1psHtakpfrMRpa7k8YriEMuKX8ABPVS\nmC2fR+5tCHEVB/hsvGhp8lK+U8vLZyj7uDFc8lDB9ZIVDO+qXhpbvnEZVLYKWMbM\nlXtK2SaDH5hDvSpya7mqmYJ6QrZGtcpkquYgKrgLKQKBgQDmooLfchORwvl0szmB\nXkpz1B52UT860cIVnfvatm6ImPqwSPGrDKJDgpbeoDaMKf2Z/pmLxWtFIzJQRXew\n53U1d2diEGprBzUhQUBQju1bLcpQkPYyVov7ZYudahOijt8pj35Zz0HsyFkDYQvv\nnRn2cosZM+uzYP9QlVgGIAS2Ig==\n-----END PRIVATE KEY-----\n",
"client_email": "vertexai@mse-test-project-436514.iam.gserviceaccount.com",
"client_id": "103535310171085862136",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/vertexai%40mse-test-project-436514.iam.gserviceaccount.com",
"universe_domain": "googleapis.com"
}
# Creator: Abir Chebbi (abir.chebbi@hesge.ch)
import boto3
import os
import argparse
from google.cloud import storage
from google.cloud import aiplatform
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain_community.vectorstores import OpenSearchVectorSearch
## S3_client
s3_client = boto3.client('s3')
## Bedrock client
bedrock_client = boto3.client(service_name="bedrock-runtime")
## Configuration for AWS authentication and OpenSearch client
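# Requests are signed with SigV4 for the OpenSearch Serverless service ('aoss') in us-east-1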
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
# Google Cloud authentication configuration
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-account-file.json"
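# Points Application Default Credentials at a service-account key file (presumably the JSON key added in this commit)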
# Google Cloud Storage client
storage_client = storage.Client()
# Download PDF documents from Google Cloud Storage
def download_documents_gcs(bucket_name, local_dir):
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs()
    for blob in blobs:
        if blob.name.endswith('.pdf'):
            local_filename = os.path.join(local_dir, blob.name)
            blob.download_to_filename(local_filename)
            print(f'Downloaded {blob.name} to {local_filename}')
## Create Index in Opensearch
def create_index(client,index_name):
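    # k-NN index: FAISS HNSW vectors of dimension 1536, matching the output size of amazon.titan-embed-text-v1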
indexBody = {
"settings": {
"index.knn": True
},
"mappings": {
"properties": {
"vector_field": {
"type": "knn_vector",
"dimension": 1536,
"method": {
"engine": "faiss",
"name": "hnsw"
}
}
}
}
}
try:
create_response = client.indices.create(index_name, body=indexBody)
print('\nCreating index:')
print(create_response)
except Exception as e:
print(e)
print("(Index likely already exists?)")
## Load docs from S3
def download_documents(bucket_name,local_dir):
response = s3_client.list_objects_v2(Bucket=bucket_name)
for item in response['Contents']:
key = item['Key']
if key.endswith('.pdf'):
local_filename = os.path.join(local_dir, key)
s3_client.download_file(Bucket=bucket_name, Key=key, Filename=local_filename)
## Split pages/text into chunks
def split_text(docs, chunk_size, chunk_overlap):
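    # chunk_size and chunk_overlap are measured in characters; main() below calls this with 1000 and 100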
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_documents(docs)
return chunks
## Generate embeddings
def generate_embeddings(bedrock_client, chunks):
embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
chunks_list=[chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(chunks_list)
return embeddings
# Store generated embeddings into an OpenSearch index.
def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
docsearch = OpenSearchVectorSearch.from_embeddings(
embeddings,
texts,
meta_data,
opensearch_url=f'https://{host}:443',
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
index_name=index_name,
bulk_size=1000
)
return docsearch
# Generate embeddings with a Vertex AI endpoint
def generate_embeddings_vertex(texts):
    # Initialize Vertex AI
    aiplatform.init(project="mse-test-project-436514", location="us-central1")
    # Replace with the ID of your endpoint
    endpoint_id = "2223196018688655360"  # Replace with the real ID
    # Get the endpoint used to generate the embeddings
    endpoint = aiplatform.Endpoint(endpoint_id)
    # Create the instances with the expected "inputs" field
    instances = [{"inputs": text} for text in texts]
    # Generate the embeddings via Vertex AI
    response = endpoint.predict(instances=instances)
    # Collect and return the embeddings
    return response.predictions
# Func to do both generating and storing embeddings
def generate_store_embeddings(bedrock_client, chunks, awsauth, host, index_name):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    docsearch = OpenSearchVectorSearch.from_documents(
        chunks,
        embeddings_model,
        opensearch_url=f'https://{host}:443',
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        index_name=index_name,
        bulk_size=1000
    )
    return docsearch
## main
def main(bucket_name, endpoint, index_name, local_path):
## Opensearch Client
OpenSearch_client = OpenSearch(
hosts=[{'host': endpoint, 'port': 443}],
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
)
download_documents(bucket_name,local_path)
loader= PyPDFDirectoryLoader(local_path)
    docs = loader.load()
print('Start chunking')
chunks = split_text(docs, 1000, 100)
print(chunks[1])
create_index(OpenSearch_client,index_name)
print('Start vectorising')
embeddings= generate_embeddings(bedrock_client, chunks)
print(embeddings[1])
texts = [chunk.page_content for chunk in chunks]
# Prepare metadata for each chunk
meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1} for chunk in chunks]
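    # PyPDFLoader's 'page' metadata is 0-based, so +1 gives a human-readable page number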
print('Start storing')
store_embeddings(embeddings, texts, meta_data ,endpoint, awsauth,index_name)
print('End storing')
# Main function for the Google Cloud pipeline
def main_gcp(bucket_name, index_name, local_path):
    download_documents_gcs(bucket_name, local_path)
    # Load the documents
    loader = PyPDFDirectoryLoader(local_path)
    docs = loader.load()
    print('Start chunking')
    chunks = split_text(docs, 1000, 100)
    texts = [chunk.page_content for chunk in chunks]
    print('Start vectorizing')
    embeddings = generate_embeddings_vertex(texts)
    # Storage or further processing of the embeddings goes here
    print('Embeddings generated:', embeddings)
    print('End processing')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
    parser.add_argument("--bucket_name", help="The S3 or GCS bucket name where documents are stored")
    parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
    parser.add_argument("--index_name", help="The name of the OpenSearch index used to store the embeddings")
    parser.add_argument("--local_path", help="Local path to store downloaded files")
    args = parser.parse_args()
    # AWS pipeline: Bedrock embeddings stored in OpenSearch
    main(args.bucket_name, args.endpoint, args.index_name, args.local_path)
    # Google Cloud pipeline: Vertex AI embeddings
    main_gcp(args.bucket_name, args.index_name, args.local_path)
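# Example invocation (a sketch: the bucket, endpoint and index values are placeholders, and the
# filename embedding.py is assumed from the commit message rather than shown on this page):
#   python embedding.py --bucket_name my-pdf-bucket \
#       --endpoint abc123.us-east-1.aoss.amazonaws.com \
#       --index_name rag-index \
#       --local_path ./pdfs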
File added