Skip to content

Examples

This page contains practical examples of using the BookWyrm client library.

Phrasal Analysis

Extract Phrases from Text

from typing import List, Union
from bookwyrm import BookWyrmClient
from bookwyrm.models import ResponseFormat, TextResult, TextSpanResult, PhraseProgressUpdate

# Create client
client: BookWyrmClient = BookWyrmClient()

text: str = """
Natural language processing (NLP) is a subfield of linguistics, computer science, 
and artificial intelligence concerned with the interactions between computers and human language.
"""

# Using function arguments (recommended)
phrases: List[TextSpanResult] = []
for response in client.stream_process_text(
    text=text,
    offsets=True,  # or response_format="with_offsets" or ResponseFormat.WITH_OFFSETS
):
    if isinstance(response, TextSpanResult):
        phrases.append(response)
        print(f"Phrase: {response.text}")
        print(f"Position: {response.start_char}-{response.end_char}")
    elif isinstance(response, PhraseProgressUpdate):
        print(f"Progress: {response.message}")

# phrases is now List[TextSpanResult] where each TextSpanResult has:
# - type: Literal["text_span"]
# - text: str (the phrase content)
# - start_char: int (starting character position)
# - end_char: int (ending character position)

Create Phrasal Text Chunks

from typing import List

# Example text with multiple sentences
text: str = """Natural language processing enables computers to understand human language. 
Machine learning algorithms power these systems. Deep learning has revolutionized the field. 
Modern NLP applications include chatbots, translation, and sentiment analysis."""

# Create phrasal chunks bounded by size - fit as many complete phrases as possible
chunks: List[TextSpanResult] = []
for response in client.stream_process_text(
    text=text,
    chunk_size=125,  # Bounded by 125 characters per chunk (smaller for demo)
    offsets=True  # boolean flag for WITH_OFFSETS
):
    if isinstance(response, TextSpanResult):
        chunks.append(response)

print(f"Created {len(chunks)} phrasal chunks")

# Example output for the above text:
# Chunk 1: "Natural language processing enables computers to understand human language. Machine learning algorithms power these systems."
# Chunk 2: "Deep learning has revolutionized the field. Modern NLP applications include chatbots, translation, and sentiment analysis."

# Each chunk is bounded by the chunk size but fits as many complete phrases/sentences as possible up to that limit
# chunks is now List[TextSpanResult] where each phrasal chunk has:
# - type: Literal["text_span"]
# - text: str (the chunk content containing multiple phrases)
# - start_char: int (starting character position)
# - end_char: int (ending character position)

Process Text from URL

from typing import TextIO

# Save to JSONL file
with open("alice_phrases.jsonl", "w") as f:
    f: TextIO
    for response in client.stream_process_text(
        text_url="https://www.gutenberg.org/files/11/11-0.txt",  # Alice in Wonderland
        chunk_size=2000,
        text_only=True  # boolean flag for TEXT_ONLY
    ):
        if isinstance(response, TextSpanResult):
            f.write(response.model_dump_json() + "\n")

Citation Finding

Basic Citation Finding

from typing import List
from bookwyrm.models import TextSpan, CitationResponse, Citation

# Prepare text chunks (you can get these from phrasal analysis above)
chunks: List[TextSpan] = [
    TextSpan(
        text="Climate change refers to long-term shifts in global temperatures and weather patterns.",
        start_char=0,
        end_char=89
    ),
    TextSpan(
        text="The primary cause is human activities, particularly fossil fuel burning.",
        start_char=90,
        end_char=161
    ),
    TextSpan(
        text="This releases greenhouse gases like CO2 into the atmosphere.",
        start_char=162,
        end_char=222
    )
]

# Find citations using streaming (the only available method)
citations: List[Citation] = []
for stream_response in client.stream_citations(
    chunks=chunks,
    question="What causes climate change?"
):
    if hasattr(stream_response, 'citation'):
        citations.append(stream_response.citation)
    elif hasattr(stream_response, 'total_citations'):
        print(f"Found {stream_response.total_citations} citations total")

print(f"Found {len(citations)} citations:")
citation: Citation
for citation in citations:
    print(f"- Quality {citation.quality}/4: {citation.text}")

# citations is List[Citation] where each Citation has:
# - start_chunk: int (inclusive)
# - end_chunk: int (inclusive) 
# - text: str (the citation content)
# - reasoning: str (why it's relevant)
# - quality: int (0-4 scale, 4=best)

Streaming Citations with Progress

from typing import List
from rich.progress import Progress, SpinnerColumn, TextColumn, TaskID
from bookwyrm.models import (
    CitationProgressUpdate, 
    CitationStreamResponse, 
    CitationSummaryResponse, 
    CitationErrorResponse,
    Citation
)

with Progress(SpinnerColumn(), TextColumn("{task.description}")) as progress:
    task: TaskID = progress.add_task("Finding citations...", total=None)

    citations: List[Citation] = []
    for update in client.stream_citations(
        chunks=chunks,
        question="What causes climate change?"
    ):
        if isinstance(update, CitationProgressUpdate):
            progress.update(task, description=update.message)
        elif isinstance(update, CitationStreamResponse):
            citations.append(update.citation)
            print(f"Found: {update.citation.text[:50]}...")
        elif isinstance(update, CitationSummaryResponse):
            print(f"Complete: {update.total_citations} citations found")
        elif isinstance(update, CitationErrorResponse):
            print(f"Error: {update.error}")

print(f"Total citations found: {len(citations)}")

# update can be one of:
# - CitationProgressUpdate: type="progress", message: str, chunks_processed: int, etc.
# - CitationStreamResponse: type="citation", citation: Citation
# - CitationSummaryResponse: type="summary", total_citations: int, usage: UsageInfo
# - CitationErrorResponse: type="error", error: str
#
# citations is List[Citation] (same structure as non-streaming)

Using JSONL Files

# Load from JSONL file
citations: List[Citation] = []
for stream_response in client.stream_citations(
    jsonl_url="https://example.com/chunks.jsonl",
    question="What is machine learning?",
    start=0,
    limit=100
):
    if hasattr(stream_response, 'citation'):
        citations.append(stream_response.citation)

PDF Extraction

Extract Text from PDF

from typing import BinaryIO
from bookwyrm.models import PDFPage, PDFTextElement

# Load PDF file using raw bytes (recommended)
with open("document.pdf", "rb") as f:
    f: BinaryIO
    pdf_bytes: bytes = f.read()

pages: List[PDFPage] = []
for response in client.stream_extract_pdf(
    pdf_bytes=pdf_bytes,
    filename="document.pdf"
):
    if hasattr(response, 'page_data'):
        pages.append(response.page_data)
    elif hasattr(response, 'total_pages') and hasattr(response, 'type') and response.type == "metadata":
        print(f"Starting extraction of {response.total_pages} pages")

print(f"Extracted {len(pages)} pages")
page: PDFPage
for page in pages:
    print(f"Page {page.page_number}: {len(page.text_blocks)} text elements")
    element: PDFTextElement
    for element in page.text_blocks[:3]:  # Show first 3 elements
        print(f"  - {element.text[:50]}...")

# pages is List[PDFPage] where each PDFPage has:
# - page_number: int (1-based)
# - text_blocks: List[PDFTextElement]
# - tables: List[dict] (placeholder)
# - images: List[dict] (placeholder)
#
# Each PDFPage has:
# - page_number: int (1-based)
# - text_blocks: List[PDFTextElement]
# - tables: List[dict] (placeholder)
# - images: List[dict] (placeholder)
#
# Each PDFTextElement has:
# - text: str (extracted text)
# - confidence: float (0.0-1.0 OCR confidence)
# - bbox: List[List[float]] (raw polygon coordinates)
# - coordinates: PDFBoundingBox (x1, y1, x2, y2 rectangle)

Stream PDF Extraction with Progress

from typing import List
from rich.progress import Progress, BarColumn, TaskProgressColumn, TaskID
from bookwyrm.models import (
    PDFStreamMetadata, 
    PDFStreamPageResponse, 
    PDFStreamPageError,
    PDFStreamComplete,
    PDFStreamError,
    PDFPage
)

pages: List[PDFPage] = []
with Progress(BarColumn(), TaskProgressColumn()) as progress:
    task: TaskID = progress.add_task("Extracting PDF...", total=100)

    for response in client.stream_extract_pdf(
        pdf_url="https://example.com/document.pdf",
        start_page=1,
        num_pages=10
    ):
        if isinstance(response, PDFStreamMetadata):
            progress.update(task, total=response.total_pages)
        elif isinstance(response, PDFStreamPageResponse):
            pages.append(response.page_data)
            progress.update(task, completed=response.current_page)
        elif isinstance(response, PDFStreamPageError):
            print(f"Error on page {response.document_page}: {response.error}")
        elif isinstance(response, PDFStreamComplete):
            print("PDF extraction completed")
        elif isinstance(response, PDFStreamError):
            print(f"Extraction error: {response.error}")

print(f"Extracted {len(pages)} pages")

# response can be one of:
# - PDFStreamMetadata: type="metadata", total_pages: int, start_page: int, etc.
# - PDFStreamPageResponse: type="page", page_data: PDFPage, document_page: int
# - PDFStreamPageError: type="page_error", error: str, document_page: int
# - PDFStreamComplete: type="complete", current_page: int
# - PDFStreamError: type="error", error: str
#
# pages is List[PDFPage] (same structure as non-streaming)

File Classification

Classify File Content

from typing import BinaryIO, Any
from bookwyrm.models import ClassifyResponse

# Read file as binary
with open("unknown_file.dat", "rb") as f:
    f: BinaryIO
    file_bytes: bytes = f.read()

# Classify using raw bytes (recommended)
response: ClassifyResponse = client.classify(
    content_bytes=file_bytes,
    filename="unknown_file.dat"
)

print(f"Format: {response.classification.format_type}")
print(f"Content Type: {response.classification.content_type}")
print(f"MIME Type: {response.classification.mime_type}")
print(f"Confidence: {response.classification.confidence:.2%}")

if response.classification.details:
    print("Details:")
    key: str
    value: Any
    for key, value in response.classification.details.items():
        print(f"  {key}: {value}")

# response is ClassifyResponse with:
# - classification: FileClassification
# - file_size: int (bytes)
# - sample_preview: Optional[str] (first few chars if text)
#
# FileClassification has:
# - format_type: str (e.g., "text", "image", "binary")
# - content_type: str (e.g., "python_code", "jpeg_image")
# - mime_type: str (e.g., "text/plain", "image/jpeg")
# - confidence: float (0.0-1.0)
# - details: dict (encoding, language, etc.)
# - classification_methods: Optional[List[str]]

Text Summarization

Basic Text Summarization

from typing import List
from bookwyrm.models import SummaryResponse, TextSpan

# Summarize from plain text
text: str = """
Natural language processing (NLP) is a subfield of linguistics, computer science, 
and artificial intelligence concerned with the interactions between computers and human language.
In particular, how to program computers to process and analyze large amounts of natural language data.
The goal is a computer capable of understanding the contents of documents, including 
the contextual nuances of the language within them. The technology can then accurately 
extract information and insights contained in the documents as well as categorize and 
organize the documents themselves.
"""

# Summarize using streaming (the only available method)
final_summary: SummaryResponse = None
for response in client.stream_summarize(
    content=text,
    max_tokens=5000
):
    if hasattr(response, 'summary'):
        final_summary = response
        break
    elif hasattr(response, 'message'):
        print(f"Progress: {response.message}")

if final_summary:
    print("Summary:")
    print(final_summary.summary)
    print(f"\nProcessed {final_summary.total_tokens} tokens across {final_summary.levels_used} levels")

# final_summary is SummaryResponse with:
# - type: Literal["summary"]
# - summary: str (the final summary text)
# - subsummary_count: int (number of intermediate summaries)
# - levels_used: int (hierarchical levels used)
# - total_tokens: int (total tokens processed)
# - intermediate_summaries: Optional[List[List[str]]] (debug info if requested)

Structured Summarization with Pydantic Models

BookWyrm supports structured output using custom Pydantic models, allowing you to extract specific information in a consistent JSON format.

import json
from typing import Optional, List
from datetime import date
from pydantic import BaseModel, Field
from bookwyrm.models import SummaryResponse, ModelStrength

# Define a custom Pydantic model for structured output
class BookSummary(BaseModel):
    """Structured summary model for books and literary works."""

    title: Optional[str] = Field(
        None,
        description="The title of the book or literary work"
    )

    author: Optional[str] = Field(
        None, 
        description="The author or authors of the work"
    )

    genre: Optional[str] = Field(
        None,
        description="The literary genre (fiction, non-fiction, mystery, etc.)"
    )

    main_themes: Optional[List[str]] = Field(
        None,
        description="List of major themes explored in the work"
    )

    plot_summary: Optional[str] = Field(
        None,
        description="Comprehensive plot summary including key events and resolution"
    )

    main_characters: Optional[List[str]] = Field(
        None,
        description="List of primary characters in the story"
    )

# Literary text to analyze
literary_text: str = """
Pride and Prejudice by Jane Austen follows Elizabeth Bennet, a witty and independent 
young woman in Regency England. When the proud Mr. Darcy arrives in her neighborhood, 
Elizabeth initially dislikes him due to his apparent arrogance. However, as she learns 
more about his true character and his acts of kindness, she realizes her prejudice was 
unfounded. The novel explores themes of love, marriage, social class, and personal growth 
as Elizabeth and Darcy overcome their initial impressions to find true love.
"""

# Convert model to JSON schema
model_schema = json.dumps(BookSummary.model_json_schema())

# Perform structured summarization
final_summary: SummaryResponse = None
for response in client.stream_summarize(
    content=literary_text,
    model_strength=ModelStrength.SMART,  # Use smart model for better structured output
    model_name="BookSummary",
    model_schema_json=model_schema,
    max_tokens=3000
):
    if hasattr(response, 'summary'):
        final_summary = response
        break
    elif hasattr(response, 'message'):
        print(f"Progress: {response.message}")

if final_summary:
    # Parse the structured JSON output
    try:
        structured_data = json.loads(final_summary.summary)
        book_summary = BookSummary.model_validate(structured_data)

        print("Structured Book Summary:")
        print(f"Title: {book_summary.title}")
        print(f"Author: {book_summary.author}")
        print(f"Genre: {book_summary.genre}")
        print(f"Main Themes: {', '.join(book_summary.main_themes or [])}")
        print(f"Plot: {book_summary.plot_summary}")
        print(f"Characters: {', '.join(book_summary.main_characters or [])}")

    except json.JSONDecodeError:
        print("Raw summary (JSON parsing failed):")
        print(final_summary.summary)

# The structured output will be JSON conforming to your Pydantic model:
# {
#   "title": "Pride and Prejudice",
#   "author": "Jane Austen", 
#   "genre": "Romance/Social Commentary",
#   "main_themes": ["Love", "Marriage", "Social Class", "Personal Growth", "Prejudice"],
#   "plot_summary": "Elizabeth Bennet initially dislikes the proud Mr. Darcy...",
#   "main_characters": ["Elizabeth Bennet", "Mr. Darcy", "Jane Bennet", "Mr. Bingley"]
# }

Advanced Structured Models

# Scientific paper analysis model
class ScientificPaper(BaseModel):
    """Model for analyzing scientific papers."""

    title: Optional[str] = Field(None, description="The paper's title")
    authors: Optional[List[str]] = Field(None, description="List of author names")
    abstract: Optional[str] = Field(None, description="The paper's abstract")
    methodology: Optional[str] = Field(None, description="Research methods used")
    key_findings: Optional[List[str]] = Field(None, description="Main research findings")
    conclusions: Optional[str] = Field(None, description="Authors' conclusions")
    keywords: Optional[List[str]] = Field(None, description="Key scientific terms and concepts")

# Business document analysis model  
class BusinessDocument(BaseModel):
    """Model for analyzing business documents."""

    document_type: Optional[str] = Field(None, description="Type of business document")
    key_metrics: Optional[List[str]] = Field(None, description="Important numbers or KPIs")
    action_items: Optional[List[str]] = Field(None, description="Tasks or actions to be taken")
    stakeholders: Optional[List[str]] = Field(None, description="People or organizations involved")
    deadlines: Optional[List[str]] = Field(None, description="Important dates or deadlines")
    budget_info: Optional[str] = Field(None, description="Budget or financial information")

# Use any model with the same pattern
scientific_text = "Research paper content here..."
model_schema = json.dumps(ScientificPaper.model_json_schema())

for response in client.stream_summarize(
    content=scientific_text,
    model_name="ScientificPaper",
    model_schema_json=model_schema,
    model_strength=ModelStrength.WISE  # Use wise for complex analysis
):
    if hasattr(response, 'summary'):
        structured_result = json.loads(response.summary)
        paper = ScientificPaper.model_validate(structured_result)
        print(f"Paper: {paper.title}")
        print(f"Authors: {', '.join(paper.authors or [])}")
        break

Custom Prompts for Specialized Analysis

# Alternative to Pydantic models: use custom prompts
final_summary: SummaryResponse = None
for response in client.stream_summarize(
    content=literary_text,
    chunk_prompt="Extract key literary elements: themes, character development, plot structure, and writing style",
    summary_of_summaries_prompt="Create a comprehensive literary analysis focusing on narrative techniques, character arcs, and thematic significance",
    model_strength=ModelStrength.CLEVER,
    max_tokens=4000
):
    if hasattr(response, 'summary'):
        final_summary = response
        break

if final_summary:
    print("Literary Analysis:")
    print(final_summary.summary)

Summarize from URL

final_summary: SummaryResponse = None
for response in client.stream_summarize(
    url="https://www.gutenberg.org/files/11/11-0.txt",  # Alice in Wonderland
    max_tokens=10000,
    debug=True  # Include intermediate summaries
):
    if hasattr(response, 'summary'):
        final_summary = response
        break
    elif hasattr(response, 'message'):
        print(f"Progress: {response.message}")

if final_summary:
    print("Final Summary:")
    print(final_summary.summary)

    if final_summary.intermediate_summaries:
        print(f"\nDebug: {len(final_summary.intermediate_summaries)} levels of summaries")
        for level, summaries in enumerate(final_summary.intermediate_summaries):
            print(f"Level {level + 1}: {len(summaries)} summaries")

Summarize from Phrases

# Use phrases from previous phrasal analysis
phrases: List[TextSpan] = [
    TextSpan(text="Machine learning is a subset of AI.", start_char=0, end_char=38),
    TextSpan(text="It uses algorithms to learn from data.", start_char=39, end_char=77),
    TextSpan(text="Deep learning uses neural networks.", start_char=78, end_char=113),
    # ... more phrases
]

final_summary: SummaryResponse = None
for response in client.stream_summarize(
    phrases=phrases,
    max_tokens=2000
):
    if hasattr(response, 'summary'):
        final_summary = response
        break
    elif hasattr(response, 'message'):
        print(f"Progress: {response.message}")

if final_summary:
    print(final_summary.summary)

Model Strength Selection

from bookwyrm.models import ModelStrength

# Different model strengths for different use cases
model_strengths = {
    ModelStrength.SWIFT: "Fast processing for quick results",
    ModelStrength.SMART: "Intelligent analysis with good quality", 
    ModelStrength.CLEVER: "Advanced reasoning capabilities",
    ModelStrength.WISE: "High-quality analysis for important content",
    ModelStrength.BRAINIAC: "Maximum sophistication for complex tasks"
}

# Choose based on your needs
for response in client.stream_summarize(
    content=complex_academic_text,
    model_strength=ModelStrength.WISE,  # High quality for academic content
    max_tokens=8000
):
    if hasattr(response, 'summary'):
        print("High-quality academic summary:")
        print(response.summary)
        break

# For quick testing or simple content
for response in client.stream_summarize(
    content=simple_text,
    model_strength=ModelStrength.SWIFT,  # Fast for simple content
    max_tokens=2000
):
    if hasattr(response, 'summary'):
        print("Quick summary:")
        print(response.summary)
        break

Async Usage

Using AsyncBookWyrmClient

import asyncio
from bookwyrm import AsyncBookWyrmClient
from bookwyrm.models import CitationResponse, CitationStreamResponse

async def main() -> None:
    client: AsyncBookWyrmClient
    async with AsyncBookWyrmClient() as client:
        # Async citation finding (streaming only)
        citations: List[Citation] = []
        async for stream_response in client.stream_citations(
            chunks=chunks,
            question="What causes climate change?"
        ):
            if hasattr(stream_response, 'citation'):
                citations.append(stream_response.citation)
            elif hasattr(stream_response, 'total_citations'):
                print(f"Found {stream_response.total_citations} citations")

        print(f"Found {len(citations)} citations")

        # Async streaming
        async for update in client.stream_citations(
            chunks=chunks,
            question="What causes climate change?"
        ):
            if isinstance(update, CitationStreamResponse):
                print(f"Citation: {update.citation.text}")

# Run async code
asyncio.run(main())

# All async methods return the same types as their sync counterparts:
# - stream_citations() -> AsyncIterator[StreamingCitationResponse]
# - stream_summarize() -> AsyncIterator[StreamingSummarizeResponse]
# - stream_process_text() -> AsyncIterator[StreamingPhrasalResponse]
# - classify() -> ClassifyResponse
# - extract_pdf() -> PDFExtractResponse
# - stream_extract_pdf() -> AsyncIterator[StreamingPDFResponse]

Error Handling

Comprehensive Error Handling

from typing import Optional
from bookwyrm.client import BookWyrmAPIError, BookWyrmClientError
from bookwyrm.models import CitationResponse

try:
    citations: List[Citation] = []
    for stream_response in client.stream_citations(
        chunks=chunks,
        question="What causes climate change?"
    ):
        if hasattr(stream_response, 'citation'):
            citations.append(stream_response.citation)
except BookWyrmAPIError as e:
    status_code: Optional[int] = e.status_code
    if status_code == 401:
        print("Authentication failed - check your API key")
    elif status_code == 429:
        print("Rate limit exceeded - please wait")
    elif status_code == 500:
        print("Server error - please try again later")
    else:
        print(f"API error: {e}")
except BookWyrmClientError as e:
    print(f"Client error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")