From 457c20a834a2f6141382e1335e32556e6360a2ca Mon Sep 17 00:00:00 2001
From: Durvesh Rajubhau Mahurkar <durvesh.mahurkar@niveussolutions.com>
Date: Tue, 1 Oct 2024 03:57:42 +0000
Subject: [PATCH] Upload New File

---
 GenAI_Case_Study.ipynb | 396 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 396 insertions(+)
 create mode 100644 GenAI_Case_Study.ipynb

diff --git a/GenAI_Case_Study.ipynb b/GenAI_Case_Study.ipynb
new file mode 100644
index 0000000..e64dd4e
--- /dev/null
+++ b/GenAI_Case_Study.ipynb
@@ -0,0 +1,396 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3c146709-280d-4b17-abd2-dd4a3114156d",
+   "metadata": {},
+   "source": [
+    "<b>Case Study:</b> Imagine you are working for GenAI, a cutting-edge AI company that specializes in natural language processing and document management. Your task is to create a coding exercise that demonstrates the integration of multiple PDF and Excel documents using vector database text embedding. Additionally, you will leverage LangChain for text summarization."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fd7d033a-2d71-4ddb-8903-d514d73c0b34",
+   "metadata": {},
+   "source": [
+    "**ContractNLI** is a dataset for document-level natural language inference (NLI) on contracts; its goal is to automate or support the time-consuming procedure of contract review. In this task, a system is given a set of hypotheses (such as “Some obligations of Agreement may survive termination.”) and a contract, and it is asked to classify whether each hypothesis is entailed by, contradicted by, or not mentioned in (neutral to) the contract, and to identify the evidence for each decision as spans in the contract."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "401d2079-1b79-4fab-bc43-ba8c43830170",
+   "metadata": {},
+   "source": [
+    "**Extract Text from Multiple PDFs and JSON Files**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "4696d663-8e7b-49d7-b087-aa011490f654",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.8.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import fitz \n",
+    "import json\n",
+    "import requests\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import faiss\n",
+    "print(faiss.__version__)\n",
+    "import numpy as np\n",
+    "from dotenv import load_dotenv \n",
+    "load_dotenv() \n",
+    "from typing import Optional, List\n",
+    "from transformers import pipeline\n",
+    "from langchain_google_genai import ChatGoogleGenerativeAI\n",
+    "from langchain_google_genai import GoogleGenerativeAIEmbeddings\n",
+    "from langchain.chains import StuffDocumentsChain\n",
+    "from langchain.chains.llm import LLMChain\n",
+    "from langchain import HuggingFaceHub\n",
+    "from langchain.llms.base import BaseLLM\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.schema import LLMResult, Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "27c80419-1535-4494-aebf-cdcc2f09faaa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_text_from_pdfs(directory):\n",
+    "    \"\"\"Return a dict mapping each `.pdf` filename in `directory` to the concatenated text of its pages.\"\"\"\n",
+    "    pdf_texts = {}\n",
+    "    for filename in os.listdir(directory):\n",
+    "        if not filename.endswith(\".pdf\"):\n",
+    "            continue  # ignore non-PDF entries\n",
+    "        pdf_path = os.path.join(directory, filename)\n",
+    "        # The context manager closes the document once its pages are read (previously it was never closed).\n",
+    "        with fitz.open(pdf_path) as doc:\n",
+    "            pdf_texts[filename] = \"\".join(page.get_text() for page in doc)\n",
+    "    return pdf_texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "9ef8503e-74af-46c6-85c4-f90739629ee6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_text_from_json(json_path):\n",
+    "    \"\"\"Load the JSON file at `json_path` and return it re-serialized as a 4-space-indented string.\"\"\"\n",
+    "    with open(json_path, \"r\", encoding=\"utf-8\") as file:  # explicit UTF-8: the platform default may be a legacy code page\n",
+    "        return json.dumps(json.load(file), indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "308f1949-a628-4071-b160-2c5abbe2a7a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATA_DIR = os.environ.get(\"CONTRACT_NLI_DIR\", r\"C:\\Users\\acer\\Documents\\contract-nli\\contract-nli\")  # dataset root; set CONTRACT_NLI_DIR to override the original author's local path\n",
+    "pdf_texts = extract_text_from_pdfs(os.path.join(DATA_DIR, \"raw\"))  # {pdf filename: extracted text}\n",
+    "json_train_text, json_test_text = (extract_text_from_json(os.path.join(DATA_DIR, name)) for name in (\"train.json\", \"test.json\"))  # pretty-printed JSON strings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "01de7d7d-233f-4f31-8111-a1f4d9ca2502",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#pdf_texts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a96e9a3-b5d5-4d8c-8d7f-972c3fe79610",
+   "metadata": {},
+   "source": [
+    "**Vectorize the Extracted Text**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b1e8117d-2f7d-4bce-a467-ce114bf1e693",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\acer\\AppData\\Roaming\\Python\\Python311\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Shared sentence-embedding model, used for both the documents and the search queries\n",
+    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
+    "\n",
+    "\n",
+    "def generate_embeddings(texts):\n",
+    "    \"\"\"Encode every value of `texts` with `model` and return a dict of embeddings under the same keys.\"\"\"\n",
+    "    # NOTE(review): each text is encoded in a single call; presumably input beyond the model's max sequence length is truncated - confirm\n",
+    "    return {\n",
+    "        name: model.encode(content) for name, content in texts.items()\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "802ff79c-da02-4a3f-b1ab-965e8b64a537",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrap each JSON string as {name: text}, the shape generate_embeddings expects; the isinstance guard keeps a re-run of this cell from nesting dicts\n",
+    "json_train_text = json_train_text if isinstance(json_train_text, dict) else {\"train_file.json\": json_train_text}\n",
+    "json_test_text = json_test_text if isinstance(json_test_text, dict) else {\"test_file.json\": json_test_text}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "6c30a3d4-843c-484e-bf09-2e11a81db4ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Embed the PDF texts and both JSON splits with the same model\n",
+    "pdf_embeddings, json_train_embeddings, json_test_embeddings = (\n",
+    "    generate_embeddings(corpus)\n",
+    "    for corpus in (pdf_texts, json_train_text, json_test_text)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89a47114-be32-4d0b-95e9-6e2bffab269a",
+   "metadata": {},
+   "source": [
+    "**Store Embeddings in a Vector Database**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "38aa7076-12cc-4ca7-adc0-653967650ea0",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Non-Disclosure%20Agreement_5.pdf\n",
+      "Basic-Non-Disclosure-Agreement.pdf\n",
+      "NDA-Agreement-NPAF.pdf\n",
+      "NDA_6.pdf\n",
+      "Non-Disclosure-Agreement-NDA.pdf\n"
+     ]
+    }
+   ],
+   "source": [
+    "pdf_filenames = list(pdf_embeddings)  # row i of the index holds the embedding of pdf_filenames[i]\n",
+    "embedding_list = np.array(list(pdf_embeddings.values())).astype('float32')\n",
+    "# Exact (brute-force) L2 index sized to the embedding dimension\n",
+    "index = faiss.IndexFlatL2(embedding_list.shape[1])\n",
+    "index.add(embedding_list)\n",
+    "# Query the index with a short phrase and list the 5 closest PDFs\n",
+    "query_embedding = model.encode(\"Non Disclosure Agreement\").astype('float32')\n",
+    "D, I = index.search(query_embedding.reshape(1, -1), k=5)\n",
+    "for idx in I[0]:\n",
+    "    if idx >= 0:  # Faiss pads with -1 when fewer than k vectors are indexed; -1 would wrongly select the last filename\n",
+    "        print(pdf_filenames[idx])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d780048e-59b5-4d48-8b94-a90649503197",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "183.pdf\n",
+      "BTS_NDA.pdf\n",
+      "BCG-Mutual-NDA.pdf\n",
+      "NMLS%20Accessibility%20NDA.pdf\n",
+      "5-Appendix-Non-Disclosure-Agreement-Mutual.pdf\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Same index, different query: a natural-language question instead of a document title\n",
+    "query_embedding = model.encode(\"What is the primary purpose of this Confidentiality Agreement between BROOKS and the CLIENT?\").astype('float32')\n",
+    "D, I = index.search(query_embedding.reshape(1, -1), k=5)  # 5 nearest neighbours by L2 distance\n",
+    "pdf_names = list(pdf_embeddings)  # built once; insertion order matches the row order used to fill the index\n",
+    "for idx in I[0]:\n",
+    "    if idx >= 0:  # Faiss pads with -1 when fewer than k vectors are indexed; -1 would wrongly select the last filename\n",
+    "        print(pdf_names[idx])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "e84264fc-d2fa-4cb4-a97f-1758f391b5d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sample contract excerpt used as the input for the Hugging Face / LangChain summarization example\n",
+    "sample_text = \"\"\"\n",
+    "\n",
+    "                            NON-DISCLOSURE AGREEMENT\n",
+    "\n",
+    "      The parties to this Agreement are MPD Technologies, Inc. (\"Disclosing\n",
+    "Party\") and the undersigned \"Recipient\". The parties desire that Disclosing\n",
+    "Party disclose certain Information or Items to Recipient, but Disclosing Party\n",
+    "desires to maintain the trade secret, proprietary or private nature of such\n",
+    "Information or Items.\n",
+    "\n",
+    "      As used herein, the following words have the indicated meanings:\n",
+    "\n",
+    "      (i) \"Disclose\" means to reveal, make known, make available, furnish, or\n",
+    "permit access to, whether or not intentionally.\n",
+    "\n",
+    "      (ii) \"Information\" means all oral, written, or other information\n",
+    "whatsoever, including information in documents and other recording media and\n",
+    "information embodied in any item, which in connection with the Matter, is (a)\n",
+    "obtained by Recipient from or through Disclosing Party, (b) obtained by or\n",
+    "through Recipient by an examination of any Item, or (c) created by or through\n",
+    "Recipient with the use of information in (a) or (b). It includes but is not\n",
+    "limited to ideas, inventions, discoveries, formulas, methods, designs, drawings,\n",
+    "specifications, engineering and manufacturing data. This information is limited\n",
+    "to trade secrets and other proprietary or private information of Disclosing\n",
+    "Party or of any third party if disclosed by or through Disclosing Party.\n",
+    "\n",
+    "      (iii) \"Item\" means any system, subsystem, assembly, subassembly, device,\n",
+    "components, product, or machine, work of authorship, or part thereof, or\n",
+    "substance which is disclosed by or through Disclosing Party hereunder, which\n",
+    "embodies trade secret or other proprietary or private information of Disclosing\n",
+    "Party or of any third party if disclosed by or through Disclosing Party.\n",
+    "\n",
+    "      (iv) \"Matter\" means the project or other matter in connection with which\n",
+    "this Agreement is executed. This matter is or relates to the potential\n",
+    "acquisition by the Recipient of a controlling ownership interest in the share\n",
+    "capital or business of the Disclosing Party.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "e757f6a8-073b-49b4-9cdc-16915e5cff90",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\acer\\AppData\\Roaming\\Python\\Python311\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build the Hugging Face summarization pipeline (facebook/bart-large-cnn) once; HuggingFaceSummarizer._generate calls it for every prompt\n",
+    "summarizer = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "dca9e4ac-3917-4c35-b13d-4772e0fbb0ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Summary:\n",
+      "The parties to this Agreement are MPD Technologies, Inc. and the undersigned \"Recipient\" The parties desire that Disclosing Party disclose certain Information or Items to Recipient. The parties also desire to maintain the trade secret, proprietary or private nature of such information or Items. \"Information\" means all oral, written, or other information                whatsoever. \"Matter\" means the project or other matter in connection with which this Agreement is executed. This matter is or relates to the potential acquisition by the Recipient of a controlling ownership interest in the share capital or business of the Discloses Party. This information is limited to trade secrets and other proprietary orPrivate information of Disclose Party or of any third party if disclosed by or through DisclOSE Party.\n"
+     ]
+    }
+   ],
+   "source": [
+    "class HuggingFaceSummarizer(BaseLLM):\n",
+    "    \"\"\"LangChain LLM adapter that answers each prompt with a summary from the module-level `summarizer` pipeline.\"\"\"\n",
+    "\n",
+    "    def _generate(self, prompts, **kwargs):\n",
+    "        \"\"\"Summarize every prompt and return an LLMResult holding one Generation per prompt; extra kwargs are ignored.\"\"\"\n",
+    "        # truncation=True clips prompts longer than the model's input limit instead of letting the pipeline fail on them\n",
+    "        results = [summarizer(prompt, max_length=480, min_length=150, do_sample=False, truncation=True) for prompt in prompts]\n",
+    "        # LangChain expects a list of Generation lists, one inner list per prompt\n",
+    "        return LLMResult(generations=[[Generation(text=result[0]['summary_text'])] for result in results])\n",
+    "\n",
+    "    @property\n",
+    "    def _llm_type(self):\n",
+    "        \"\"\"Type label for this LLM.\"\"\"\n",
+    "        return \"huggingface\"\n",
+    "\n",
+    "# LangChain-compatible wrapper around the summarization pipeline\n",
+    "llm = HuggingFaceSummarizer()\n",
+    "\n",
+    "# Prompt with a single {text} slot for the document to summarize\n",
+    "prompt_template = PromptTemplate(\n",
+    "    template=\"Please summarize the following text:\\n{text}\",\n",
+    "    input_variables=[\"text\"],\n",
+    ")\n",
+    "\n",
+    "# Chain = prompt formatting followed by the LLM call\n",
+    "summarization_chain = LLMChain(prompt=prompt_template, llm=llm)\n",
+    "\n",
+    "text_to_summarize = sample_text\n",
+    "summary = summarization_chain.run(text=text_to_summarize)\n",
+    "print(\"Summary:\", summary, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e8bb0d9-cb4a-4e0c-97f9-16f5838cb097",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab