diff --git a/Part 1/main.py b/Part 1/main.py index 3a6c71710f062a44a30ef0d569eb7d2388bafdad..6438a255f7275ba1b9928703e68bae5185ecc76e 100644 --- a/Part 1/main.py +++ b/Part 1/main.py @@ -11,9 +11,6 @@ import json import argparse -## Local directory for storing PDF files -LOCAL_DIR = "pdfs" - ## S3_client @@ -127,7 +124,7 @@ def generate_store_embeddings(bedrock_client, chunks,awsauth,index_name): ## main -def main(bucket_name, endpoint,index_name): +def main(bucket_name, endpoint,index_name, local_path): ## Opensearch Client OpenSearch_client = OpenSearch( @@ -139,8 +136,8 @@ def main(bucket_name, endpoint,index_name): ) - download_documents(bucket_name,LOCAL_DIR) - loader= PyPDFDirectoryLoader(LOCAL_DIR) + download_documents(bucket_name,local_path) + loader= PyPDFDirectoryLoader(local_path) docs = loader.load() print(docs[1]) chunks = split_text(docs, 1000, 100) @@ -168,5 +165,6 @@ if __name__== "__main__": parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored") parser.add_argument("--endpoint", help="The OpenSearch service endpoint") parser.add_argument("--index_name", help="The name of the OpenSearch index") + parser.add_argument("--local_path", help="local path") args = parser.parse_args() - main(args.bucket_name, args.endpoint, args.index_name) + main(args.bucket_name, args.endpoint, args.index_name, args.local_path) diff --git a/README.md b/README.md index ec86e12d069f1dde438fc1d239e0f5949a52836d..ebd7a7164b03e8db753ed24b1b3629557a4cb23e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Where: - **--bucket_name**: The name of the S3 bucket containing the PDF files. - **--endpoint**: Endpoint for the vector database. - **--index_name**: The index_name where to store the embeddings in the collection. +- **--local_path**: The local directory for storing the PDF files downloaded from the S3 bucket. The main.py script will: