Pass History to the Prompt

Update build_prompt to accept conversation history and embed it between the context and the question.

💻

Writing code and entering commands is only available on desktop. Open this page on a larger screen to complete this chapter.

Where history belongs in the prompt

The format_history function converts the message list to a text block. That block needs to appear in the prompt after the retrieved context and before the question, so the model can use both the document context and the conversation history when formulating an answer.

Adding a history parameter to build_prompt keeps history injection in one place. The default value of None means existing callers that don't pass history continue to work without modification.

Instructions

Add a new parameter named history to the list of parameters of the build_prompt function, after context_chunks and before file_list. The default value is None.
On the line after context = ..., add history_text = format_history(history or []).
The history block must sit between the context and the question. Edit the return string:
- Change f"Context:\n{context}\n\n" to f"Context:\n{context}" — remove the trailing double newline, because history_text already starts with its own newline.
- Add f"{history_text}\n\n" on the next line, before f"Question:\n{question}".

← Previous Chapter Next Chapter →

import json
import os
import sys
import time
import numpy as np
from dotenv import load_dotenv
from google import genai
from google.genai import types
from files import index_folder

def create_client():
    """Return a ``genai.Client`` configured with the ``GEMINI_API_KEY`` value.

    ``load_dotenv()`` runs before the environment lookup, so the key can be
    supplied through a ``.env`` file as well as the process environment.
    """
    load_dotenv()
    return genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def embed_text(client, text):
    """Embed one piece of text as a document and return its vector.

    ``text`` is sent to the ``gemini-embedding-001`` model with the
    ``RETRIEVAL_DOCUMENT`` task type; the values of the first embedding in
    the response are returned.
    """
    document_config = types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=text,
        config=document_config,
    )
    return response.embeddings[0].values

def embed_all_chunks(client, texts):
    """Embed every string in ``texts``, pausing between batches.

    Texts are processed in order in groups of 90, with one ``embed_text``
    call per text. After each group except the last, a notice is printed
    and the function sleeps for 60 seconds before continuing.

    Returns:
        A list with one embedding per input text, in input order.
    """
    batch_size = 90
    total = len(texts)
    vectors = []
    start = 0
    while start < total:
        stop = start + batch_size
        vectors.extend(embed_text(client, item) for item in texts[start:stop])
        # Only pause when another group is still waiting to be sent.
        if stop < total:
            print("Rate limit pause — waiting 60 seconds...")
            time.sleep(60)
        start = stop
    return vectors

def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        vec_a: First vector (any sequence NumPy can treat as a 1-D array).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        When either vector has a norm of zero the similarity is undefined,
        so ``0.0`` is returned.
    """
    dot = np.dot(vec_a, vec_b)
    norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # Dividing by a zero norm would yield NaN (plus a RuntimeWarning), and a
    # NaN score makes any later sort by similarity unreliable.
    if norm == 0:
        return 0.0
    return dot / norm

def search(client, query, chunks, embeddings, top_k=3):
    """Return the ``top_k`` chunks whose embeddings best match ``query``.

    The query is embedded with ``gemini-embedding-001`` using the
    ``RETRIEVAL_QUERY`` task type and scored against every entry of
    ``embeddings`` with ``cosine_similarity``. ``chunks`` and ``embeddings``
    are paired by position, and the chunks come back in descending order
    of score.
    """
    query_config = types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=query,
        config=query_config,
    )
    query_vector = response.embeddings[0].values

    ranked = sorted(
        (
            (cosine_similarity(query_vector, vector), chunk)
            for vector, chunk in zip(embeddings, chunks)
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [chunk for _score, chunk in ranked[:top_k]]

def build_prompt(question, context_chunks, file_list=None, history=None):
    """Assemble the prompt text sent to the model for one question.

    The prompt contains, in order: an optional line listing the available
    files, the fixed instructions, the retrieved context, the conversation
    history (if any), and the question.

    Args:
        question: The user's current question.
        context_chunks: Iterable of chunk dicts; each chunk's ``"text"`` value
            is included, with chunks separated by a blank line.
        file_list: Optional list of file names to mention at the top of the
            prompt. Omitted when empty or ``None``.
        history: Optional list of earlier messages in the form accepted by
            ``format_history``. It is the last parameter so that existing
            positional calls ``(question, context_chunks, file_list)`` keep
            their meaning. Omitted from the prompt when empty or ``None``.

    Returns:
        The complete prompt as a single string.
    """
    context = "\n\n".join(chunk["text"] for chunk in context_chunks)
    # With no history there is nothing to format; the prompt is then exactly
    # what it was before history support existed.
    history_text = format_history(history) if history else ""
    files_line = ""
    if file_list:
        files_line = f"You have access to these files: {', '.join(file_list)}\n"
    return (
        f"{files_line}"
        "You are a helpful assistant. Answer the question using only the context below.\n"
        "If the answer is not in the context, say \"I don't know.\"\n\n"
        # No trailing newlines after the context: the history block starts
        # with its own newline, and the blank line before the question is
        # added after the history instead.
        f"Context:\n{context}"
        f"{history_text}\n\n"
        f"Question:\n{question}"
    )

def generate_answer(client, prompt):
    """Send ``prompt`` to the ``gemini-2.5-flash`` model and return its text.

    The return value is the ``text`` attribute of the response object.
    """
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
    )
    return reply.text

def save_embeddings(chunks, embeddings, cache_path):
    """Write ``chunks`` and ``embeddings`` to ``cache_path`` as JSON.

    The file holds a single JSON object with the keys ``"chunks"`` and
    ``"embeddings"``. The file is opened in write mode, so any existing
    content at ``cache_path`` is replaced.
    """
    with open(cache_path, "w") as cache_file:
        json.dump({"chunks": chunks, "embeddings": embeddings}, cache_file)

def load_embeddings(cache_path):
    """Load cached chunks and embeddings from ``cache_path``.

    Returns:
        A ``(chunks, embeddings)`` tuple read from the ``"chunks"`` and
        ``"embeddings"`` keys of the JSON file, or ``None`` when no file
        exists at ``cache_path``.
    """
    if os.path.exists(cache_path):
        with open(cache_path) as cache_file:
            cached = json.load(cache_file)
        return cached["chunks"], cached["embeddings"]
    return None

def format_history(messages):
    """Render a list of chat messages as a text block for the prompt.

    Each message is a dict with ``"role"`` and ``"content"`` keys. Messages
    whose role is ``"user"`` are labelled ``You``; every other role is
    labelled ``Assistant``.

    Returns:
        An empty string when ``messages`` is empty or ``None``. Otherwise a
        block that starts with a newline and the heading
        ``Conversation so far:``, followed by one line per message.
    """
    if not messages:
        return ""
    turns = [
        f"{'You' if message['role'] == 'user' else 'Assistant'}: {message['content']}"
        for message in messages
    ]
    return "\n".join(["\nConversation so far:", *turns])

def chat_loop(client, chunks, embeddings):
    """Run the interactive question/answer loop until input ends.

    For every non-empty line the user types, the most relevant chunks are
    retrieved with ``search``, a prompt is built with ``build_prompt``, and
    the model's answer from ``generate_answer`` is printed. Blank lines are
    ignored.

    Args:
        client: API client passed through to ``search`` and
            ``generate_answer``.
        chunks: Chunk dicts; each must have a ``"source"`` key, used to build
            the sorted, de-duplicated file list shown to the model.
        embeddings: Embeddings paired by position with ``chunks``.

    Returns:
        ``None``, once standard input is exhausted or the user interrupts
        the prompt.
    """
    file_list = sorted(set(chunk["source"] for chunk in chunks))
    print("Assistant ready. Type your question, or /help for commands.\n")
    while True:
        try:
            question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input (Ctrl-D or an exhausted pipe) or Ctrl-C at the
            # prompt: finish the prompt line and leave without a traceback.
            print()
            return
        if not question:
            continue
        top_chunks = search(client, question, chunks, embeddings)
        prompt = build_prompt(question, top_chunks, file_list)
        answer = generate_answer(client, prompt)
        print(f"Assistant: {answer}")

def main():
    """Command-line entry point: index a folder (or load its cache) and chat.

    Expects the folder path as the first command-line argument; prints a
    usage message and exits with status 1 when it is missing. Embeddings are
    cached next to the folder in ``<folder>.cache.json`` and reused on later
    runs instead of re-indexing.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <folder>")
        sys.exit(1)

    folder = sys.argv[1]
    # Trailing path separators are dropped so the cache sits beside the
    # folder rather than inside it.
    cache_path = folder.rstrip("/\\") + ".cache.json"
    client = create_client()

    cached = load_embeddings(cache_path)
    if not cached:
        print(f"Indexing {folder}...")
        chunks = index_folder(folder)
        texts = [chunk["text"] for chunk in chunks]
        sources = {chunk["source"] for chunk in chunks}
        print(f"Indexed {len(chunks)} chunks from {len(sources)} files.")
        embeddings = embed_all_chunks(client, texts)
        save_embeddings(chunks, embeddings, cache_path)
        print(f"Cache saved to {cache_path}")
    else:
        chunks, embeddings = cached
        print(f"Loaded cache from {cache_path}")

    chat_loop(client, chunks, embeddings)

# Start the application only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Pass History to the Prompt

Where history belongs in the prompt

Instructions

Interactive Code Editor