New

Launch Open-Source Apps with LangChain

Notebook

SingleStore Notebooks

Launch Open-Source Apps with LangChain

In [1]:

%%writefile requirements.txt
# Pinned core stack for this notebook
jinja2==3.0.3
langchain==0.0.339
openai==1.3.3
# PDF parsing dependencies used by `unstructured` / OnlinePDFLoader.
# NOTE: the legacy `pdfminer` package was removed — it installs the same
# top-level `pdfminer` module as `pdfminer.six` (the maintained fork that
# `unstructured` requires) and having both causes one to clobber the other.
pdf2image
pdfminer.six
pillow_heif
tabulate
tiktoken
unstructured
opencv-python-headless
unstructured.pytesseract
unstructured.inference

In [2]:

# Install the pinned requirements into this kernel's environment
# (%pip, rather than !pip, targets the running kernel).
%pip install -r requirements.txt --quiet

In [3]:

from langchain.document_loaders import OnlinePDFLoader
# Download and parse a sample PDF into LangChain Document objects
# (parsing is done by the `unstructured` stack installed above).
loader = OnlinePDFLoader("http://leavcom.com/pdf/DBpdf.pdf")
data = loader.load()  # list of Document objects; consumed by the cells below

In [4]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Sanity-check what the loader returned before splitting.
# NOTE(review): data[0] is assumed to hold the whole PDF's text —
# OnlinePDFLoader appears to return a single Document here.
print (f"You have {len(data)} document(s) in your data")
print (f"There are {len(data[0].page_content)} characters in your document")

In [5]:

# Split the document into ~2000-character chunks with no overlap;
# each chunk becomes one embedded row in the table created below.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
print (f"You have {len(texts)} pages")

In [6]:

%%sql
-- Start from a clean slate: drop and recreate the demo database.
DROP DATABASE IF EXISTS pdf_db;
CREATE DATABASE IF NOT EXISTS pdf_db;

Action Required

Make sure to select the pdf_db database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

In [7]:

%%sql
-- One row per text chunk: the chunk's text plus its embedding stored as a
-- packed 32-bit-float BLOB (written via JSON_ARRAY_PACK_F32 below).
DROP TABLE IF EXISTS pdf_docs1;
CREATE TABLE IF NOT EXISTS pdf_docs1 (
id INT PRIMARY KEY,
content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
vector BLOB
);

In [8]:

import os
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [9]:

import json
import sqlalchemy as sa
from langchain.embeddings import OpenAIEmbeddings
from singlestoredb import create_engine

# create_engine() reads the notebook's connection_url, which is set by the
# database drop-down at the top (select pdf_db there first).
conn = create_engine().connect()

embedder = OpenAIEmbeddings()

# Embed every chunk in one batched API call (far cheaper and faster than
# one request per chunk).
embeddings = embedder.embed_documents([doc.page_content for doc in texts])

# Build one parameter dict per row. BUG FIX: each element of `texts` is a
# LangChain Document, so we must store its `.page_content` string — the
# original code inserted the Document object itself, so the stored text
# would not match the embedding computed from `.page_content` above.
params = [
    dict(id=i, content=doc.page_content, vector=json.dumps(embedding))
    for i, (doc, embedding) in enumerate(zip(texts, embeddings), start=1)
]

stmt = sa.text("""
INSERT INTO pdf_docs1 (
id,
content,
vector
)
VALUES (
:id,
:content,
JSON_ARRAY_PACK_F32(:vector)
)
""")

# Passing a list of dicts performs an executemany-style bulk insert.
conn.execute(stmt, params)

In [10]:

%%sql
-- Spot-check: unpack one stored vector back into a JSON array of floats.
SELECT JSON_ARRAY_UNPACK_F32(vector) as vector
FROM pdf_docs1
LIMIT 1;

In [11]:

query_text = "Will object-oriented databases be commercially successful?"
# Embed the question with the same model used for the document chunks so
# the dot-product scores are comparable.
query_embedding = embedder.embed_documents([query_text])[0]
# Exact nearest-neighbour search: score every row by dot product against
# the query vector and keep the single best match.
stmt = sa.text("""
SELECT
content,
DOT_PRODUCT_F32(JSON_ARRAY_PACK_F32(:embedding), vector) AS score
FROM pdf_docs1
ORDER BY score DESC
LIMIT 1
""")
results = conn.execute(stmt, dict(embedding=json.dumps(query_embedding)))
# NOTE(review): `row` deliberately leaks out of this loop — the next cell
# reads it. With LIMIT 1 the loop runs at most once; if the table were
# empty, `row` would be undefined below.
for row in results:
    print(row[0])

In [12]:

import openai
# Reads OPENAI_API_KEY from the environment (set earlier via getpass).
client = openai.OpenAI()
# NOTE(review): `row` and `query_text` come from the previous cell's
# similarity search — this cell only works if that search returned a row.
prompt = f"The user asked: {query_text}. The most similar text from the document is: {row[0]}"
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)
print(response.choices[0].message.content)

Clean up

In [13]:

%%sql
-- Clean up: drop the demo database created by this notebook.
DROP DATABASE IF EXISTS pdf_db

Details

Tags

#vectordb #genai #langchain

License

This Notebook has been released under the Apache 2.0 open source license.