Introduction#
GitHub: this note shows how to host an embedding vector database on Aurora PostgreSQL and build a chatbot with Streamlit
- Setup Aurora PostgreSQL and pgvector extension
- LangChain and HuggingFace embeddings
- Build chatbot with streamlit
[!NOTE]
Please register a Hugging Face token, as this architecture still uses the Hugging Face API for the chatbot while the embedding vector database is hosted on Aurora. The default database name is postgres in this case; alternatively, you can create an initial database name from the AWS console when creating a new Aurora DB.
Setup Aurora#
- Create Aurora PostgreSQL
- Install pgvector extension
Please use Aurora PostgreSQL v15.3 with pgvector support. To connect EC2 (Amazon Linux 2023) to Aurora PostgreSQL, we need to install the psql client.
sudo dnf install postgresql15
Then connect
psql --host=<DB instance endpoint> --port=<port> --username=<master username> --password --dbname=<database name>
Install the pgvector extension
CREATE EXTENSION vector;
LangChain and HF Embedding#
Import libs and load env
# Imports for the notebook and environment loading.
from dotenv import load_dotenv
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy
from langchain.docstore.document import Document
import os  # restored: os.environ is used below when building the PGVector connection string

# Read PGVECTOR_* variables (driver, user, password, host, port, database) from ./env.
load_dotenv(dotenv_path="./env")
Let's load the document and split it using LangChain
# Read the trimmed hotel-review CSV; each row becomes one LangChain Document.
csv_path = "./data/fictitious_hotel_reviews_trimmed_500.csv"
loader = CSVLoader(csv_path)
documents = loader.load()
Split text to chunk with overlaps
# Chunk the documents: up to 1000 characters per chunk, no overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents=documents)
Then create a HF embeddings
# Create the embedding model used for both indexing and querying.
model_id = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceInstructEmbeddings(model_name=model_id)
Convert text to embedding vector
# Sanity check: embed one sentence and keep the resulting vector(s).
sample_texts = ["how are you doing today"]
x = embeddings.embed_documents(sample_texts)
Integrate with LangChain to write to database. First create a connection to aurora
# Assemble the Aurora PostgreSQL connection string from the PGVECTOR_* env vars
# loaded earlier via load_dotenv.
connection_string = PGVector.connection_string_from_db_params(
    driver=os.environ.get("PGVECTOR_DRIVER"),
    user=os.environ.get("PGVECTOR_USER"),
    password=os.environ.get("PGVECTOR_PASSWORD"),
    host=os.environ.get("PGVECTOR_HOST"),
    port=os.environ.get("PGVECTOR_PORT"),
    database=os.environ.get("PGVECTOR_DATABASE"),
)
Then write to the database
from typing import List, Tuple

# Embed every chunk and persist the vectors into the named collection
# on Aurora PostgreSQL (pgvector).
collection_name = "fictitious_hotel_reviews"
db = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=collection_name,
    connection_string=connection_string,
)
Let's try a similarity query
# Query the store; each hit comes back as a (Document, score) pair.
# NOTE(review): for the default distance strategy a lower score means
# more similar — confirm against the configured DistanceStrategy.
query = "What do some of the positive reviews say?"
docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)
Check result
# Pretty-print each hit: separator, score, chunk text, and source metadata.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(doc.metadata)
    print(separator)
Streamlit ChatBot#
Please double check requirements.txt and htmlTemplates.py
langchain
python-dotenv
jupyter
psycopg
psycopg2-binary
pgvector
huggingface-hub
InstructorEmbedding
sentence-transformers
streamlit
altair
urllib3
PyPDF2
Let's create app.py and build a chatbot using Streamlit
"""Streamlit Q&A chatbot backed by pgvector on Amazon Aurora PostgreSQL.

Users upload PDFs, which are chunked, embedded, and stored in PGVector;
questions are answered by a ConversationalRetrievalChain over that store.
"""
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.vectorstores.pgvector import PGVector
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    """Split raw text into ~1000-char chunks with 200-char overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks):
    """Return a PGVector store over the Aurora database.

    With text_chunks=None, connect to the existing collection without
    writing anything (used at app start-up); otherwise embed and insert
    the given chunks.
    """
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2")
    if text_chunks is None:
        return PGVector(
            connection_string=CONNECTION_STRING,
            embedding_function=embeddings,
        )
    return PGVector.from_texts(
        texts=text_chunks, embedding=embeddings,
        connection_string=CONNECTION_STRING)


def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over the store, using a
    Hugging Face Hub-hosted LLM and buffered chat memory."""
    llm = HuggingFaceHub(
        repo_id="MBZUAI/LaMini-Flan-T5-783M",
        model_kwargs={"temperature": 0.2, "max_length": 4096})
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        # k=1: retrieve only the single closest chunk per question
        retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
        memory=memory,
    )
    return conversation_chain


def handle_userinput(user_question):
    """Run the question through the chain and render the chat history."""
    try:
        response = st.session_state.conversation({'question': user_question})
    except ValueError:
        st.write("Sorry, please ask again in a different way.")
        return
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        # Even indices are user turns, odd indices are bot turns.
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", message.content),
                     unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", message.content),
                     unsafe_allow_html=True)


def main():
    """Streamlit entry point: page setup, sidebar upload flow, Q&A box."""
    st.set_page_config(page_title="Streamlit Question Answering App",
                       layout="wide",
                       page_icon=":books::parrot:")
    st.write(css, unsafe_allow_html=True)

    st.sidebar.markdown("""
### Instructions:
1. Browse and upload PDF files
2. Click Process
3. Type your question in the search bar to get more insights
""")

    # First visit: connect to the existing collection (no writes yet).
    if "conversation" not in st.session_state:
        st.session_state.conversation = get_conversation_chain(
            get_vectorstore(None))
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("GenAI Q&A with pgvector and Amazon Aurora PostgreSQL :books::parrot:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            type="pdf", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)
                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                # create vector store
                vectorstore = get_vectorstore(text_chunks)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(vectorstore)
                st.success('PDF uploaded successfully!', icon="✅")


if __name__ == '__main__':
    load_dotenv("./env")
    CONNECTION_STRING = PGVector.connection_string_from_db_params(
        driver=os.environ.get("PGVECTOR_DRIVER"),
        user=os.environ.get("PGVECTOR_USER"),
        password=os.environ.get("PGVECTOR_PASSWORD"),
        host=os.environ.get("PGVECTOR_HOST"),
        port=os.environ.get("PGVECTOR_PORT"),
        database=os.environ.get("PGVECTOR_DATABASE"))
    main()
Then run the app
streamlit run app.py