AI Document Agent
Autonomous AI agent for legal document processing
100K+
Documents processed
97%
Accuracy
80%
Time saved
Technology Stack
Background
A Parisian law firm specializing in corporate law and mergers & acquisitions was processing hundreds of contractual documents every week: due diligence reports, NDAs, share purchase agreements, and shareholder pacts. Lawyers were spending 60 to 70% of their time on document reading and standardized information extraction — work that appeared to carry high added value but was in reality repetitive for highly qualified professionals.
The initial request was a simple clause extraction tool. The delivered solution went much further: an autonomous AI agent capable of understanding document context, extracting structured information, detecting unusual or risky clauses, and producing structured summary memos.
Technical Challenges
Processing Complex Documents
Legal documents present specific challenges: dense multi-page content, cross-references between articles, tables, annexes, and signature blocks. The ingestion pipeline had to preserve the logical structure of the document.
# document_processor/ingestion.py
import hashlib
import os

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
class LegalDocumentProcessor:
def __init__(self):
self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
self.pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
self.index = self.pc.Index("legal-docs")
def process_document(self, file_path: str, metadata: dict) -> str:
loader = UnstructuredPDFLoader(
file_path,
mode="elements",
strategy="hi_res",
infer_table_structure=True,
)
elements = loader.load()
# Reconstruct logical sections from document elements
sections = self._reconstruct_sections(elements)
splitter = RecursiveCharacterTextSplitter(
chunk_size=1500,
chunk_overlap=300,
separators=["\n\nArticle ", "\n\n", "\n", ". "]
)
doc_id = hashlib.sha256(file_path.encode()).hexdigest()[:16]
vectors = []
for i, chunk in enumerate(splitter.split_documents(sections)):
embedding = self.embeddings.embed_query(chunk.page_content)
vectors.append({
"id": f"{doc_id}-{i}",
"values": embedding,
"metadata": {
**metadata,
"doc_id": doc_id,
"chunk_index": i,
"text": chunk.page_content,
}
})
self.index.upsert(vectors=vectors, namespace=metadata["client_id"])
return doc_id
def _reconstruct_sections(self, elements):
# Groups elements by logical section
# using headings as section markers
sections = []
current_section = []
for el in elements:
if el.metadata.get("category") == "Title":
if current_section:
sections.append("\n".join(current_section))
current_section = [el.page_content]
else:
current_section.append(el.page_content)
if current_section:
sections.append("\n".join(current_section))
return sectionsMulti-Step Analysis Agent
The agent uses a ReAct (Reasoning + Acting) architecture with specialized tools for each type of analysis.
# agent/legal_agent.py
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_openai import ChatOpenAI
from langchain.tools import tool
@tool
def extract_parties(doc_id: str) -> dict:
    """Extracts the parties (name, role, legal representative) from a contract."""
    import json  # stdlib; the snippet's header imports omit it although json.loads is required

    # Pull only the chunks likely to mention the parties, then ask the LLM
    # for a structured extraction constrained to the JSON schema below.
    chunks = retrieve_relevant_chunks(doc_id, "contracting parties signatories representatives")
    response = llm.invoke(EXTRACTION_PROMPT.format(
        task="parties",
        schema='{"parties": [{"name": str, "role": str, "legal_rep": str}]}',
        context="\n".join(chunks)
    ))
    # ROBUSTNESS: models frequently wrap JSON in markdown fences
    # (```json ... ```); strip them so json.loads does not raise.
    raw = response.content.strip()
    if raw.startswith("```"):
        raw = raw.strip("`").strip()
        if raw.startswith("json"):
            raw = raw[len("json"):].strip()
    return json.loads(raw)
@tool
def detect_unusual_clauses(doc_id: str, contract_type: str) -> list[dict]:
    """Detects unusual or potentially risky clauses."""
    # A clause counts as unusual when its similarity to the standard-clause
    # corpus for this contract type falls below the 0.65 threshold; each
    # flagged clause is annotated with a risk level and an explanation.
    reference_clauses = get_standard_clauses(contract_type)
    flagged = []
    for clause in retrieve_all_chunks(doc_id):
        if compute_clause_similarity(clause, reference_clauses) >= 0.65:
            continue  # close enough to a standard clause — not unusual
        flagged.append({
            "text": clause,
            "risk_level": classify_risk(clause),
            "explanation": generate_risk_explanation(clause),
        })
    return flagged
@tool
def generate_summary_memo(doc_id: str, language: str = "en") -> str:
    """Generates a structured legal summary memo."""
    # BUG FIX: @tool wraps a function into a StructuredTool, which is not
    # plainly callable — extract_parties(doc_id) would raise at runtime.
    # Tools must be called through .invoke() with their argument dict.
    parties = extract_parties.invoke({"doc_id": doc_id})
    key_terms = extract_key_terms(doc_id)
    unusual_clauses = detect_unusual_clauses.invoke(
        {"doc_id": doc_id, "contract_type": "generic"}
    )
    return llm.invoke(MEMO_PROMPT.format(
        parties=parties,
        key_terms=key_terms,
        unusual_clauses=unusual_clauses,
        language=language
    )).content
legal_agent = AgentExecutor(
agent=create_openai_tools_agent(
llm=ChatOpenAI(model="gpt-4o", temperature=0),
tools=[extract_parties, detect_unusual_clauses, generate_summary_memo],
prompt=LEGAL_AGENT_PROMPT,
),
tools=[extract_parties, detect_unusual_clauses, generate_summary_memo],
verbose=True,
max_iterations=10,
)FastAPI with Job Queue
# api/main.py
from fastapi import FastAPI, BackgroundTasks, UploadFile
from redis import Redis
import rq
app = FastAPI()
# Redis connection shared by the API and the RQ queue; "redis" is presumably
# the service hostname inside the deployment network — confirm against infra.
redis_conn = Redis(host="redis", port=6379)
# Queue drained by separate worker processes; the API only enqueues jobs.
queue = rq.Queue("document_processing", connection=redis_conn)
@app.post("/documents/analyze")
async def analyze_document(
    file: UploadFile,
    client_id: str,
    contract_type: str,
):
    """Accept an uploaded document and enqueue it for asynchronous analysis.

    Returns the RQ job id so the client can poll
    GET /documents/{job_id}/status for progress and results.

    FIX: removed the injected `background_tasks: BackgroundTasks` parameter —
    it was never used (the work goes through the RQ queue, not FastAPI
    background tasks); dropping it does not change the HTTP interface.
    """
    # Persist the upload to a path the worker process can read.
    file_path = await save_temp_file(file)
    job = queue.enqueue(
        process_and_analyze,
        file_path,
        client_id,
        contract_type,
        job_timeout=300,  # seconds; large PDFs with hi_res parsing are slow
    )
    return {"job_id": job.id, "status": "queued"}
@app.get("/documents/{job_id}/status")
async def get_analysis_status(job_id: str):
job = rq.job.Job.fetch(job_id, connection=redis_conn)
return {
"status": job.get_status(),
"result": job.result if job.is_finished else None,
}Deployed Solution
The agent runs on an AWS ECS cluster with auto-scaling based on the RQ queue depth. Pinecone hosts the vector indexes with a namespace per client for data isolation. Processed documents are encrypted at rest (AES-256) and in transit (TLS 1.3).
A Next.js dashboard allows lawyers to submit documents, track processing progress, validate extractions, and annotate cases where the agent made errors — these annotations feed a monthly fine-tuning cycle for the extraction model.
Results
Since deployment in August 2025, the agent has processed over 100,000 legal documents. The accuracy rate on standardized clause extraction reaches 97%, measured by human validation on a representative sample. Lawyers have reduced time spent on extraction and preliminary review tasks by 80%, now focusing on strategic analysis and client advisory work.