Update app.py

Remove the file I/O functions from app.py and rewrite main() to accept a folder

💻

Writing code and entering commands is only available on desktop. Open this page on a larger screen to complete this chapter.

Now that files.py owns all file I/O, app.py can be cleaned up. Three things need to change:

Remove the moved functions. list_files, read_file, and index_folder now live in files.py. The SUPPORTED_EXTENSIONS constant moved there too. Keeping these in app.py would create two copies that could drift out of sync.

Remove the PDF-specific code. The original app.py was built around PDFs — extract_text uses pypdf, chunk_text splits raw text, and print_result displays answers. None of these belong in the new folder-based assistant. import pypdf goes away with them.

Rewrite main(). The new version accepts a folder path instead of a PDF path and a question. It calls index_folder from files.py to get structured chunks, then extracts the plain text strings before embedding.

Notice that embed_all_chunks now takes texts (plain strings extracted from the chunk dicts) instead of the dicts themselves. main() extracts them with [chunk["text"] for chunk in chunks] before passing them to embed_all_chunks.

After this chapter, running python3 app.py ./docs will index the folder and cache the result — but won't yet answer questions. The question-answering loop comes in Lesson 2.

Instructions

Delete the extract_text, chunk_text, print_result, list_files, read_file, and index_folder functions from app.py.
Delete import pypdf.
Delete the SUPPORTED_EXTENSIONS constant — it now lives in files.py.
Add from files import index_folder below the existing imports.
Replace main() with this new version:

def main():
       if len(sys.argv) < 2:
           print("Usage: python app.py <folder>")
           sys.exit(1)
       folder = sys.argv[1]
       cache_path = folder.rstrip("/\\") + ".cache.json"
       client = create_client()
       cached = load_embeddings(cache_path)
       if cached:
           chunks, embeddings = cached
           print(f"Loaded cache from {cache_path}")
       else:
           print(f"Indexing {folder}...")
           chunks = index_folder(folder)
           texts = [chunk["text"] for chunk in chunks]
           file_count = len(set(chunk["source"] for chunk in chunks))
           print(f"Indexed {len(chunks)} chunks from {file_count} files.")
           embeddings = embed_all_chunks(client, texts)
           save_embeddings(chunks, embeddings, cache_path)
           print(f"Cache saved to {cache_path}")

← Previous Chapter Next Chapter →

import json
import os
import sys
import time
import numpy as np
import pypdf  # Step 2: delete this line
from dotenv import load_dotenv
from google import genai
from google.genai import types
# Step 4: add "from files import index_folder" here

# Step 1: delete this function
def extract_text(pdf_path):
    reader = pypdf.PdfReader(pdf_path)
    pages = [page.extract_text() for page in reader.pages]
    return "\n".join(pages)

# Step 1: delete this function
def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i : i + chunk_size])
    return chunks

def create_client():
    load_dotenv()
    api_key = os.getenv("GEMINI_API_KEY")
    client = genai.Client(api_key=api_key)
    return client

def embed_text(client, text):
    result = client.models.embed_content(model="gemini-embedding-001", contents=text, config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT"))
    return result.embeddings[0].values

def embed_all_chunks(client, chunks):
    BATCH_SIZE = 90
    embeddings = []
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i : i + BATCH_SIZE]
        for chunk in batch:
            embeddings.append(embed_text(client, chunk))
        if i + BATCH_SIZE < len(chunks):
            print("Rate limit pause — waiting 60 seconds...")
            time.sleep(60)
    return embeddings

def cosine_similarity(vec_a, vec_b):
    dot = np.dot(vec_a, vec_b)
    norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return dot / norm

def search(client, query, chunks, embeddings, top_k=3):
    result = client.models.embed_content(model="gemini-embedding-001", contents=query, config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY"))
    query_vector = result.embeddings[0].values
    scores = [(cosine_similarity(query_vector, emb), chunk) for emb, chunk in zip(embeddings, chunks)]
    scores.sort(key=lambda x: x[0], reverse=True)
    return [chunk for _, chunk in scores[:top_k]]

def build_prompt(question, context_chunks):
    context = "\n\n".join(context_chunks)
    prompt = f"You are a helpful assistant. Answer the question using only the context below.\nIf the answer is not in the context, say \"I don't know.\"\n\nContext:\n{context}\n\nQuestion:\n{question}"
    return prompt

def generate_answer(client, prompt):
    response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
    return response.text

# Step 1: delete this function
def print_result(answer, source_chunks, show_sources=True):
    print("Answer:")
    print(answer)
    if show_sources:
        print("\nSources:")
        for i, chunk in enumerate(source_chunks, 1):
            print(f"Source {i}:\n{chunk}\n")

def save_embeddings(chunks, embeddings, cache_path):
    data = {"chunks": chunks, "embeddings": embeddings}
    with open(cache_path, "w") as f:
        json.dump(data, f)

def load_embeddings(cache_path):
    if not os.path.exists(cache_path):
        return None
    with open(cache_path) as f:
        data = json.load(f)
    return data["chunks"], data["embeddings"]

SUPPORTED_EXTENSIONS = {".txt", ".md", ".py", ".js", ".ts", ".yaml", ".yml", ".json"}  # Step 3: delete this line

# Step 1: delete this function
def list_files(folder):
    file_paths = []
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            _, ext = os.path.splitext(filename)
            if ext in SUPPORTED_EXTENSIONS:
                file_paths.append(os.path.join(dirpath, filename))
    return file_paths

# Step 1: delete this function
def read_file(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except (UnicodeDecodeError, OSError):
        return None

# Step 1: delete this function
def index_folder(folder, chunk_size=500, overlap=100):
    file_paths = list_files(folder)
    chunks = []
    for path in file_paths:
        text = read_file(path)
        if not text:
            continue
        filename = os.path.basename(path)
        for i in range(0, len(text), chunk_size - overlap):
            chunks.append({"text": text[i:i + chunk_size], "source": filename})
    return chunks

# Step 5: replace this entire function with the new version
def main():
    pdf_path = sys.argv[1]
    question = sys.argv[2]
    cache_path = pdf_path + ".cache.json"
    client = create_client()
    print(f"Loading {pdf_path}...")
    cached = load_embeddings(cache_path)
    if cached:
        chunks, embeddings = cached
        print(f"Loaded cache from {cache_path}")
    else:
        text = extract_text(pdf_path)
        chunks = chunk_text(text)
        print(f"No cache found. Embedding {len(chunks)} chunks...")
        embeddings = embed_all_chunks(client, chunks)
        save_embeddings(chunks, embeddings, cache_path)
        print(f"Cache saved to {cache_path}")
    top_chunks = search(client, question, chunks, embeddings)
    prompt = build_prompt(question, top_chunks)
    answer = generate_answer(client, prompt)
    print_result(answer, top_chunks, show_sources=False)

if __name__ == "__main__":
    main()

Update app.py

Instructions

Interactive Code Editor