Retrieval Augmented Generation On JFK Speeches: Part 1¶
1. Introduction
2. Scraping JFK Speeches Using Asyncio
3. Loading and Embedding Speeches
4. Ingesting Speeches Into A Pinecone Vector Database
5. Next Steps
1. Introduction ¶
In this post I venture into building a Retrieval Augmented Generation (RAG) application that has been "trained" on President John F. Kennedy's speeches. In past posts I covered how I collected JFK's speeches and built a "speech writer" using a Gated Recurrent Unit (GRU) Neural Network. In this post I improve upon that prior work to build a RAG pipeline.
The first thing I will cover is how I collected the data, including extra metadata on the speeches, and how I used the Asyncio package to reduce run time when writing to object storage. Next, I will go over how to load the json files from Google Cloud Storage using different LangChain loaders. After that I cover how to embed the documents and ingest them into a Pinecone vector database. In a follow-up post I'll cover how to create and deploy the actual RAG application.
Now I'll import all the classes and functions I will need for the rest of the post.
# LangChain
from langchain_google_community.gcs_file import GCSFileLoader
from langchain_google_community.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_pinecone.vectorstores import PineconeVectorStore
# Google Cloud
import os
from google.cloud import storage
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file('../credentials.json')
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../credentials.json"
# Pinecone VectorDB
from pinecone import Pinecone
from pinecone import ServerlessSpec
# API Keys
from dotenv import load_dotenv
load_dotenv()
True
2. Scraping JFK Speeches using Asyncio ¶
In the first post of my work on a speech writer I covered how to ingest the JFK speeches from his presidential library into Google Cloud Storage. I was never completely satisfied with the way I wrote the job and decided to go back and redo it using the Asyncio library to perform asynchronous reading of HTML and writing of json to Google Cloud Storage. The json documents include the text of the speech, its title, its source and the url for the speech. I don't want to go into the details of this work, but I will say it was not as hard as I would have thought! The main task was to turn functions which use the requests package into coroutines. Informally, when using the requests.get method to scrape a website, query a REST API or perform other I/O, the process is "blocking". This means the Python task is not able to proceed until it receives the return value (or hears back) from the API or website. In the time the program spends waiting, the threads and CPU could be doing other work. The Asyncio library allows Python to free up these idling threads to do other work while waiting for the I/O to complete.
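To make the idea concrete, here is a minimal sketch of the pattern, using aiohttp in place of requests (the function names and the speech_urls variable are illustrative, not the actual script):

import asyncio
import aiohttp

async def fetch_speech(session: aiohttp.ClientSession, url: str) -> str:
    # Awaiting the response frees the event loop to start other downloads.
    async with session.get(url) as response:
        return await response.text()

async def scrape_all(urls: list[str]) -> list[str]:
    async with aiohttp.ClientSession() as session:
        # gather schedules all the coroutines to run concurrently.
        return await asyncio.gather(*[fetch_speech(session, url) for url in urls])

# html_pages = asyncio.run(scrape_all(speech_urls))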
If you are interested in reading more about it, the script is here.
3. Loading and Embedding Speeches ¶
At this point I have run the extract.py script which scraped the JFK library website and converted the speeches into json. The speeches exist as json documents in Google Cloud Storage, and ingesting them into Pinecone requires the JSONLoader class from LangChain. In addition to loading the documents I also wanted to add metadata to them. I did so using LangChain by creating the metadata_func below:
from typing import Dict

def metadata_func(record: Dict[str, str], metadata: Dict[str, str]) -> Dict[str, str]:
    metadata["title"] = record.get("title")
    metadata["source"] = record.get("source")
    metadata["url"] = record.get("url")
    metadata["filename"] = record.get("filename")
    return metadata
I put this function to use by passing it as the metadata_func parameter when instantiating a JSONLoader,
loader = JSONLoader(
    file_path,
    jq_schema=jq_schema,
    text_content=False,
    content_key="text",
    metadata_func=metadata_func
)
However, I would only be able to use this loader object on a local json document with a path (file_path) on my file system. In order to load json from a GCP bucket instead, I need to wrap this in a function that takes in a file path (file_path), applies the metadata function, and returns an instantiated JSONLoader object to read the file:
def load_json(file_path: str, jq_schema: str="."):
    return JSONLoader(
        file_path,
        jq_schema=jq_schema,
        text_content=False,
        content_key="text",
        metadata_func=metadata_func
    )
Now I can pass this function to LangChain's GCSFileLoader. I can then instantiate the class to load the first debate between Kennedy and Nixon from my GCP bucket. The full path for this json document is,
gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json
The code to load the json document is,
loader = GCSFileLoader(
    project_name=credentials.project_id,
    bucket="kennedyskis",
    blob="1st-nixon-kennedy-debate-19600926.json",
    loader_func=load_json
)

document = loader.load()
This will return a list of LangChain Document(s). The text of the debate can be seen using the .page_content attribute,
print(document[0].page_content[:1000])
[Text, format, and style are as published in Freedom of Communications: Final Report of the Committee on Commerce, United States Senate..., Part III: The Joint Appearances of Senator John F. Kennedy and Vice President Richard M. Nixon and Other 1960 Campaign Presentations. 87th Congress, 1st Session, Senate Report No. 994, Part 3. Washington: U.S. Government Printing Office, 1961.] Monday, September 26, 1960 Originating CBS, Chicago, Ill., All Networks carried. Moderator, Howard K. Smith. MR. SMITH: Good evening. The television and radio stations of the United States and their affiliated stations are proud to provide facilities for a discussion of issues in the current political campaign by the two major candidates for the presidency. The candidates need no introduction. The Republican candidate, Vice President Richard M. Nixon, and the Democratic candidate, Senator John F. Kennedy. According to rules set by the candidates themselves, each man shall make an opening statement of approx
The metadata for the document can be seen from the .metadata attribute,
document[0].metadata
{'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926', 'filename': '1st-nixon-kennedy-debate-19600926'}
This debate document (and documents generally) is usually too long to fit in the context window of an LLM, so we need to break it up into smaller pieces of text. This process is called "chunking". Below I will show how to break up the Nixon-Kennedy debate into "chunks" of 200 characters, with 20 characters of overlap between chunks. I do this using the RecursiveCharacterTextSplitter class as shown below,
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
documents = text_splitter.split_documents(document)
print("Number of documents: ", len(documents))
Number of documents: 429
Now we can look at the documents and their associated metadata,
for n, doc in enumerate(documents[:3]):
    print(f"Doc {n}: ", doc.page_content, "\n", "\tMetadata:", doc.metadata, "\n")
Doc 0:  [Text, format, and style are as published in Freedom of Communications: Final Report of the Committee on Commerce, United States Senate..., Part III: The Joint Appearances of Senator John F. Kennedy
	Metadata: {'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926', 'filename': '1st-nixon-kennedy-debate-19600926'}

Doc 1:  John F. Kennedy and Vice President Richard M. Nixon and Other 1960 Campaign Presentations. 87th Congress, 1st Session, Senate Report No. 994, Part 3. Washington: U.S. Government Printing Office,
	Metadata: {'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926', 'filename': '1st-nixon-kennedy-debate-19600926'}

Doc 2:  Printing Office, 1961.]
	Metadata: {'source': 'gs://kennedyskis/1st-nixon-kennedy-debate-19600926.json', 'seq_num': 1, 'title': 'Senator John F. Kennedy and Vice President Richard M. Nixon First Joint Radio-Television Broadcast, September 26, 1960', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/1st-nixon-kennedy-debate-19600926', 'filename': '1st-nixon-kennedy-debate-19600926'}
Notice the metadata is the same for each of the documents since they all come from the same original json file.
Now that the data is loaded, we'll go over how to use embeddings to convert the text into vectors. I have covered embeddings in prior posts, so I won't go over them in much detail here. Instead I will focus on the LangChain commands needed to use embeddings. We can instantiate the LangChain NVIDIAEmbeddings class, which uses Nvidia's Llama 3.2 embedding model, and then use the embed_query method to embed a single document as shown:
embedding = NVIDIAEmbeddings(
    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
    api_key=os.getenv("NVIDIA_API_KEY"),
    dimension=2048,
    truncate="NONE"
)
query = embedding.embed_query(documents[0].page_content)
Now we can see the first 5 entries of the vector,
print("First 5 entries in embedded document:", query[:5])
First 5 entries in embedded document: [-0.00730133056640625, 0.01448822021484375, 0.01450347900390625, 0.00974273681640625, 0.0265350341796875]
As well as the size of the vector:
print("Vector size:", len(query))
Vector size: 2048
The embedding of text is important for the retrieval step of RAG. We embed all our documents, then embed our question, and use the embeddings to perform semantic search, which improves the quality of the results. I'll touch on this a little more towards the end of this blog post.
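As a quick illustration of what semantic search does with these vectors, we can score a question against a chunk with cosine similarity (a minimal sketch; the cosine_similarity helper and the example question below are my own, not part of LangChain):

import numpy as np

def cosine_similarity(u: list[float], v: list[float]) -> float:
    # cosine similarity = dot product of the vectors over the product of their norms
    u, v = np.array(u), np.array(v)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

question_vec = embedding.embed_query("Who moderated the first Nixon-Kennedy debate?")
chunk_vec = embedding.embed_query(documents[0].page_content)
print("Similarity:", cosine_similarity(question_vec, chunk_vec))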
4. Ingesting Speeches Into A Pinecone Vector Database ¶
Now we can load all of President Kennedy's speeches using a GCSDirectoryLoader, which loads an entire directory in a bucket instead of just a single file. I can see the speeches of his presidency by getting the bucket and listing the names of the speeches:
client = storage.Client(project=credentials.project_id,
                        credentials=credentials)

bucket = client.get_bucket("prezkennedyspeches")
speeches = [blob.name for blob in bucket.list_blobs()]

print(f"JFK had {len(speeches)} speeches in his presidency.")
JFK had 22 speeches in his presidency.
The speeches are:
speeches
['american-newspaper-publishers-association-19610427.json',
 'american-society-of-newspaper-editors-19610420.json',
 'american-university-19630610.json',
 'americas-cup-dinner-19620914.json',
 'berlin-crisis-19610725.json',
 'berlin-w-germany-rudolph-wilde-platz-19630626.json',
 'civil-rights-radio-and-television-report-19630611.json',
 'cuba-radio-and-television-report-19621022.json',
 'inaugural-address-19610120.json',
 'inaugural-anniversary-19620120.json',
 'irish-parliament-19630628.json',
 'latin-american-diplomats-washington-dc-19610313.json',
 'massachusetts-general-court-19610109.json',
 'peace-corps-establishment-19610301.json',
 'philadelphia-pa-19620704.json',
 'rice-university-19620912.json',
 'united-nations-19610925.json',
 'united-states-congress-special-message-19610525.json',
 'university-of-california-berkeley-19620323.json',
 'university-of-mississippi-19620930.json',
 'vanderbilt-university-19630518.json',
 'yale-university-19620611.json']
Next I load all of the speeches using the GCSDirectoryLoader and split them into chunks of 2,000 characters with 100 characters of overlap using the load_and_split method:
loader = GCSDirectoryLoader(
    project_name=credentials.project_id,
    bucket="prezkennedyspeches",
    loader_func=load_json
)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
documents = loader.load_and_split(text_splitter)

print(f"There are {len(documents)} documents")
print(f"There are {len(documents)} documents")
There are 180 documents
Now we're ready to connect to Pinecone and ingest the data into the vector database. I can create the connection to Pinecone using the command,
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
I'll create an index in Pinecone to store the documents. An index is basically a collection of embedded documents, similar to a table in a traditional database. Vector databases are specialized databases that allow for the storage of vectors as well as for fast searches and retrievals. The vectors have numerical values and represent the documents in embedded form. The vectors are usually high dimensional (in our case 2,048 dimensions) and dense. However, compared to other representations of text, such as the Bag-Of-Words model, embedding vectors are relatively low dimensional. There are many benefits of vector embeddings, and one of the most important is the ability to measure semantic similarity between two vectors. This allows us to measure the degree of similarity between pieces of text based on their meaning, rather than just the words used, as would be the case with the Bag-Of-Words model. This property of embeddings is depicted in the classic example below,
Words that have similar "meaning" and/or are used in the same context, like "cat" and "kitten", are closer together when represented as vectors in the embedding space than they are to the word "house". Embeddings also capture intrinsic relationships between words, such as the fact that "man" is to "king" as "woman" is to "queen".
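We can loosely sanity-check the "cat"/"kitten" example with our embedding model and the cosine_similarity helper from above (single words fed to a sentence-embedding model won't reproduce textbook word2vec results exactly, but the relative distances typically behave as described):

cat = embedding.embed_query("cat")
kitten = embedding.embed_query("kitten")
house = embedding.embed_query("house")

# "cat" should score closer to "kitten" than to "house"
print("cat vs kitten:", cosine_similarity(cat, kitten))
print("cat vs house: ", cosine_similarity(cat, house))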
The ability to capture and measure the closeness of words and text using embeddings allows us to perform semantic search. Semantic search will be extremely important for RAG models and will be discussed more in the next post. For now I'll give the index a name and declare the dimension of the vectors it will hold.
index_name = "prez-speeches"
dim = 2048
First I delete the index if it exists to clear it of all prior records.
# delete the index if it exists
if pc.has_index(index_name):
    pc.delete_index(index_name)
Now I'll create the index that contains vectors of size dim:
# create the index
pc.create_index(
    name=index_name,
    dimension=dim,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
{ "name": "prez-speeches", "metric": "cosine", "host": "prez-speeches-2307pwa.svc.aped-4627-b74a.pinecone.io", "spec": { "serverless": { "cloud": "aws", "region": "us-east-1" } }, "status": { "ready": true, "state": "Ready" }, "vector_type": "dense", "dimension": 2048, "deletion_protection": "disabled", "tags": null }
Notice we have to declare the distance metric that will be used for similarity search. We can then get the statistics on the index we created,
print(pc.Index(index_name).describe_index_stats())
{'dimension': 2048, 'index_fullness': 0.0, 'metric': 'cosine', 'namespaces': {}, 'total_vector_count': 0, 'vector_type': 'dense'}
It shows us that the index holds vectors of 2,048 dimensions and that there are currently 0 vectors in the index.
To ingest documents into the database as vectors we instantiate the PineconeVectorStore object, connect it to the index and pass the embedding object,
vectordb = PineconeVectorStore(
    pinecone_api_key=os.getenv("PINECONE_API_KEY"),
    embedding=embedding,
    index_name=index_name
)
Now I'll load the documents into the index:
vectordb = vectordb.from_documents(
    documents=documents,
    embedding=embedding,
    index_name=index_name
)
Under the hood LangChain will call the embedding.embed_documents method to convert the documents from text to numerical vectors and then ingest them into the database.
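Roughly speaking, the two steps look like this (a simplified sketch, not LangChain's actual source):

# 1. Embed the chunk texts into 2,048-dimensional vectors...
texts = [doc.page_content for doc in documents]
vectors = embedding.embed_documents(texts)

# 2. ...then the vectors, along with the chunk text and metadata,
#    are upserted into the Pinecone index.
print(len(vectors), len(vectors[0]))  # 180 vectors of dimension 2048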
One of the beautiful things about LangChain is how the consistency of the API allows for easily swapping out and replacing different components of LLM applications. For instance, one can switch to using a Chroma database and the syntax remains exactly the same! This characteristic of LangChain is important as each of the underlying databases and embedding models has its own API methods that are not necessarily consistent with one another. However, using LangChain we have a consistent API and do not need to learn the different syntax for the different backends.
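For example, a hypothetical swap to Chroma could look like the sketch below (assuming the langchain-chroma package is installed; the collection name is illustrative):

from langchain_chroma import Chroma

# Same documents, same embedding object; only the vector store class changes.
chroma_db = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    collection_name="prez-speeches"
)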
Now let's get the stats on the index again,
print(pc.Index(index_name).describe_index_stats())
{'dimension': 2048, 'index_fullness': 0.0, 'metric': 'cosine', 'namespaces': {'': {'vector_count': 180}}, 'total_vector_count': 180, 'vector_type': 'dense'}
We can see that there are vectors ingested!
Now I can use the Pinecone API directly to get the index and use it to perform semantic search,
index = pc.Index(index_name)
This allows us to search for the semantically closest documents to a query. For instance, I'll use the query,
question = "How did Kennedy feel about the Berlin Wall?"
Before I can perform search on the vector database I need to embed this text into a numerical vector,
query = embedding.embed_query(question)
Now I can find the 5 closest vectors to the query in the database,
matches = index.query(vector=query, top_k=5)
matches
{'matches': [{'id': 'b9e573a6-d9f9-4306-a6e3-72ac769643dd',
              'score': 0.436862975,
              'values': []},
             {'id': 'd0245e9a-b4f2-46e6-a6d0-07ee3afbad16',
              'score': 0.422326,
              'values': []},
             {'id': 'a6bcd4fa-90a3-46b2-a48d-105115ccaed7',
              'score': 0.394667208,
              'values': []},
             {'id': 'ffe2db4a-6983-4cde-a853-658080619575',
              'score': 0.35799697,
              'values': []},
             {'id': 'b7c5ebca-1886-4670-9acd-55ce4e402c2c',
              'score': 0.352600902,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}
The results contain the similarity score as well as the document id. I can get the most relevant document by getting the first id in the results:
id = matches["matches"][0].get('id')
Then I can get the document for that id with the fetch method of the index:
result = index.fetch([id])
result.vectors[id]["metadata"]
{'filename': 'berlin-w-germany-rudolph-wilde-platz-19630626', 'seq_num': 1.0, 'source': 'gs://prezkennedyspeches/berlin-w-germany-rudolph-wilde-platz-19630626.json', 'text': 'Freedom has many difficulties and democracy is not perfect, but we have never had to put a wall up to keep our people in, to prevent them from leaving us. I want to say, on behalf of my countrymen, who live many miles away on the other side of the Atlantic, who are far distant from you, that they take the greatest pride that they have been able to share with you, even from a distance, the story of the last 18 years. I know of no town, no city, that has been besieged for 18 years that still lives with the vitality and the force, and the hope and the determination of the city of West Berlin. While the wall is the most obvious and vivid demonstration of the failures of the Communist system, for all the world to see, we take no satisfaction in it, for it is, as your Mayor has said, an offense not only against history but an offense against humanity, separating families, dividing husbands and wives and brothers and sisters, and dividing a people who wish to be joined together.\nWhat is true of this city is true of Germany--real, lasting peace in Europe can never be assured as long as one German out of four is denied the elementary right of free men, and that is to make a free choice. In 18 years of peace and good faith, this generation of Germans has earned the right to be free, including the right to unite their families and their nation in lasting peace, with good will to all people. You live in a defended island of freedom, but your life is part of the main. So let me ask you as I close, to lift your eyes beyond the dangers of today, to the hopes of tomorrow, beyond the freedom merely of this city of Berlin, or your country of Germany, to the advance of freedom everywhere, beyond the wall to the day of peace with justice, beyond yourselves and ourselves to all mankind.', 'title': 'Remarks of President John F. Kennedy at the Rudolph Wilde Platz, Berlin, June 26, 1963', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/berlin-w-germany-rudolph-wilde-platz-19630626'}
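As an aside, the query method can also return the metadata directly by passing include_metadata=True, which avoids the separate fetch (a small sketch):

# Retrieve the top match along with its metadata in a single call
matches = index.query(vector=query, top_k=1, include_metadata=True)
print(matches["matches"][0]["metadata"]["title"])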
I can repeat the same exercise using the LangChain PineconeVectorStore API:
results = vectordb.search(query=question, search_type="similarity")
results[0].metadata
{'filename': 'berlin-w-germany-rudolph-wilde-platz-19630626', 'seq_num': 1.0, 'source': 'gs://prezkennedyspeches/berlin-w-germany-rudolph-wilde-platz-19630626.json', 'title': 'Remarks of President John F. Kennedy at the Rudolph Wilde Platz, Berlin, June 26, 1963', 'url': 'https://www.jfklibrary.org//archives/other-resources/john-f-kennedy-speeches/berlin-w-germany-rudolph-wilde-platz-19630626'}
The results are the same, which is to be expected!
5. Next Steps ¶
In this post I covered how to scrape websites using asyncio and write the results to Google Cloud Storage. After that we covered how to use LangChain to load text from cloud storage, then chunk and embed it using Nvidia embeddings. Then we covered how to store the embedded documents as vectors in a Pinecone vector database and perform semantic search. In the next blog post I will build on semantic search with Pinecone to build and deploy a RAG application that can answer questions on President Kennedy's speeches. I actually rewrote the notebook as a script that uses LangChain's async frameworks to load all 800+ JFK speeches into the Pinecone database, and it is here if you are interested.