Python Coding

LLM Model API 
 LMStudio 
 from langchain.llms import OpenAI

# LangChain LLM handle that talks to the model served by LM Studio through its
# OpenAI-compatible endpoint on localhost. The key is only a placeholder
# value (presumably the local server does not validate it -- confirm).
llm = OpenAI(
    openai_api_key="NULL",
    openai_api_base="http://localhost:1234/v1",
)
 import streamlit as st
from openai import OpenAI

# Page header.
st.title("ChatGPT Clone using Llama-3 🦙")
st.caption("Chat with locally hosted Llama-3 using the LM Studio 💯")

# OpenAI-compatible client aimed at the local LM Studio server.
client = OpenAI(api_key="lm-studio", base_url="http://localhost:1234/v1")

# Create the conversation log the first time the session runs.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Re-render every stored turn so the whole conversation stays visible.
for turn in st.session_state.messages:
    with st.chat_message(turn["role"]):
        st.markdown(turn["content"])

# Read the next user message; nothing below runs until one is submitted.
prompt = st.chat_input("What is up?")
if prompt:
    # Record and show the user's turn.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Send the full history to the model so it answers in context.
    response = client.chat.completions.create(
        model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
        messages=st.session_state.messages,
        temperature=0.7,
    )
    reply = response.choices[0].message.content

    # Record and show the assistant's turn.
    st.session_state.messages.append({"role": "assistant", "content": reply})
    with st.chat_message("assistant"):
        st.markdown(reply)
 GPT 
 from langchain_openai import ChatOpenAI

# LangChain chat model bound to the "gpt-4o" model.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_retries=2,
    # No explicit limits are passed for these two.
    timeout=None,
    max_tokens=None,
    # Further optional settings that can be supplied here:
    # api_key="...", base_url="...", organization="...", other params...
)
 Ollama 
 from langchain_community.llms import Ollama

# Wrap the "llama2:13b" model via LangChain's Ollama integration and send it
# a single prompt. The result is not stored; in a notebook cell it would be
# displayed as the cell output.
llm = Ollama(model="llama2:13b")
llm.invoke(
    "The first man on the moon was ... think step by step"
)
 Chunking/Splitting 
 中文句子切割 
 # Unicode code points used as split delimiters:
#   \u3002  full-width period
#   \uff0c  full-width comma
# To look up the escape sequence of a specific character:
#   >>> '，'.encode('unicode-escape')  # Python 3
#   >>> list(u'，')                     # Python 2

import re

text = "這是中文句子。第一段，第二段，第三段。"
# re.split() yields an empty string after a trailing delimiter (and between
# consecutive delimiters); drop those so only non-empty chunks remain.
chunks = [piece for piece in re.split(r"[\u3002\uff0c]", text) if piece]
# Alternative output: print("\n\n".join(chunks))
for chunk in chunks:
    print("---" * 10)
    print(chunk)
 英文句子切割 
 # Split on runs of whitespace (\s+ = one or more whitespace characters) that
# directly follow '.', '?' or '!'. The lookbehind does not consume the
# punctuation, so it stays attached to the preceding sentence.
chunks = re.compile(r'(?<=[.?!])\s+').split(text)
 PDF to Markdown 
 
 僅限文字內容，無法處理其他格式，例如圖片等。 
 pdf_to_md.ipynb 
 
 import os
import pymupdf.layout
import pymupdf4llm
from pathlib import Path
import glob

# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _markdown_path(pdf_path, output_dir):
    """Return the ``.md`` path inside *output_dir* that belongs to *pdf_path*.

    The extension is appended to the PDF's stem instead of being applied with
    ``with_suffix``: on a dotted stem such as ``report.v2`` that call would
    replace ``.v2`` and produce ``report.md``, making ``report.v1.pdf`` and
    ``report.v2.pdf`` overwrite each other.
    """
    return Path(output_dir) / f"{Path(pdf_path).stem}.md"


def pdf_to_markdown(pdf_path, output_dir):
    """Convert a single PDF to Markdown and write it into *output_dir*.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives ``<pdf stem>.md``.
    """
    # The context manager closes the document even when conversion raises.
    with pymupdf.open(pdf_path) as doc:
        md = pymupdf4llm.to_markdown(
            doc,
            header=False,
            footer=False,
            page_separators=True,
            ignore_images=True,
            write_images=False,
            image_path=None,
        )
        source_name = doc.name

    # Round-trip through UTF-8 to drop lone surrogate code points:
    # "surrogatepass" lets them be encoded, and decoding with "ignore" then
    # discards the resulting invalid byte sequences.
    md_cleaned = md.encode('utf-8', errors='surrogatepass').decode('utf-8', errors='ignore')
    _markdown_path(source_name, output_dir).write_bytes(md_cleaned.encode('utf-8'))


def pdfs_to_markdowns(path_pattern, overwrite: bool = False):
    """Convert every PDF matching *path_pattern* into ``MARKDOWN_DIR``.

    Args:
        path_pattern: Glob pattern selecting the PDF files to convert.
        overwrite: When False (the default), PDFs whose Markdown file already
            exists are skipped; when True, every match is converted again.
    """
    output_dir = Path(MARKDOWN_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in map(Path, glob.glob(path_pattern)):
        # Same path helper as the writer, so the existence check always looks
        # at the file pdf_to_markdown() would produce.
        if overwrite or not _markdown_path(pdf_path, output_dir).exists():
            pdf_to_markdown(pdf_path, output_dir)


# Convert all PDFs found directly inside DOCS_DIR.
pdfs_to_markdowns(f"{DOCS_DIR}/*.pdf")
 Prompt 
 def get_rag_agent_prompt() -> str:
     """Return the prompt text for the retrieval-augmented (RAG) agent.

     The text instructs the assistant to search the documents and inspect
     the retrieved content before it produces any final answer.
     """
     prompt_text = """
 You are a retrieval-augmented assistant.

 You are NOT allowed to answer immediately.

 Before producing ANY final answer, you must first perform a document search
 and observe retrieved content.

 If you have not searched, the answer is invalid.

 Workflow:
 1. Search the documents using the user query.
 2. Inspect retrieved excerpts and keep only relevant ones.
 ...
 """
     return prompt_text