Launch Open-Source Apps with LangChain

In [1]:

%%writefile requirements.txt
jinja2==3.0.3
langchain==0.0.339
openai==1.3.3
pdf2image
pdfminer
pdfminer.six
pillow_heif
tabulate
tiktoken
unstructured
opencv-python-headless
unstructured.pytesseract
unstructured.inference

In [2]:

%pip install -r requirements.txt --quiet

In [3]:

from langchain.document_loaders import OnlinePDFLoader

# Download the PDF from the URL and extract its text into LangChain documents
loader = OnlinePDFLoader("http://leavcom.com/pdf/DBpdf.pdf")
data = loader.load()

In [4]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")

In [5]:

# Split the document into ~2,000-character chunks with no overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
print(f"You have {len(texts)} chunks")

In [6]:

%%sql
DROP DATABASE IF EXISTS pdf_db;
CREATE DATABASE IF NOT EXISTS pdf_db;

Action Required

Make sure to select the pdf_db database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.
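
To verify that the selection took effect, a minimal check such as the one below works, assuming the notebook exposes the updated connection_url as a variable in the Python environment:

In [ ]:

import sqlalchemy as sa

# connection_url is provided by the notebook once a database is selected (assumed here)
engine = sa.create_engine(connection_url)
with engine.connect() as check_conn:
    # Prints the database the connection is currently using; expect 'pdf_db'
    print(check_conn.execute(sa.text("SELECT DATABASE()")).scalar())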

In [7]:

%%sql
DROP TABLE IF EXISTS pdf_docs1;
CREATE TABLE IF NOT EXISTS pdf_docs1 (
    id INT PRIMARY KEY,
    content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
    vector BLOB
);

In [8]:

import os
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [9]:

import json

import sqlalchemy as sa
from langchain.embeddings import OpenAIEmbeddings
from singlestoredb import create_engine

# create_engine() uses the notebook's connection settings, so it is already
# pointed at the pdf_db database selected above
conn = create_engine().connect()

embedder = OpenAIEmbeddings()

# Fetch all embeddings in one call
embeddings = embedder.embed_documents([doc.page_content for doc in texts])

# Build query parameters: one row per chunk, with its embedding serialized as a JSON array
params = []
for i, (doc, embedding) in enumerate(zip(texts, embeddings)):
    params.append(dict(id=i + 1, content=doc.page_content, vector=json.dumps(embedding)))

stmt = sa.text("""
    INSERT INTO pdf_docs1 (
        id,
        content,
        vector
    )
    VALUES (
        :id,
        :content,
        JSON_ARRAY_PACK_F32(:vector)
    )
""")

conn.execute(stmt, params)

In [10]:

%%sql
SELECT JSON_ARRAY_UNPACK_F32(vector) as vector
FROM pdf_docs1
LIMIT 1;

In [11]:

# Embed the question with the same model used for the document chunks
query_text = "Will object-oriented databases be commercially successful?"
query_embedding = embedder.embed_documents([query_text])[0]

# Rank the stored chunks by dot-product similarity to the query embedding
stmt = sa.text("""
    SELECT
        content,
        DOT_PRODUCT_F32(JSON_ARRAY_PACK_F32(:embedding), vector) AS score
    FROM pdf_docs1
    ORDER BY score DESC
    LIMIT 1
""")

results = conn.execute(stmt, dict(embedding=json.dumps(query_embedding)))
for row in results:
    print(row[0])

In [12]:

import openai

client = openai.OpenAI()

# Combine the question with the most similar chunk retrieved above (row[0])
prompt = f"The user asked: {query_text}. The most similar text from the document is: {row[0]}"

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)

print(response.choices[0].message.content)

Clean up

In [13]:

%%sql
DROP DATABASE IF EXISTS pdf_db;

Details

About this Template

Use SingleStoreDB as the vector database for your LangChain apps.
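
For reference, LangChain also ships a SingleStoreDB vector store integration that wraps the manual table creation, embedding, and similarity search shown above. A minimal sketch, assuming the connection settings (for example the SINGLESTOREDB_URL environment variable) point at pdf_db and using an illustrative table name:

In [ ]:

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import SingleStoreDB

# Embed the chunks and store them in a SingleStoreDB-backed vector store;
# "pdf_docs_langchain" is an illustrative table name, not part of the notebook above
vectorstore = SingleStoreDB.from_documents(
    texts,
    OpenAIEmbeddings(),
    table_name="pdf_docs_langchain",
)

# Retrieve the chunk most similar to the question
docs = vectorstore.similarity_search(
    "Will object-oriented databases be commercially successful?", k=1
)
print(docs[0].page_content)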

Tags

vectordb, genai, langchain

License

This Notebook has been released under the Apache 2.0 open source license.