Merge pull request #65 from lfnovo/update_frameworks

Replace the content extraction graph with content-core
2025-05-31 15:15:12 -03:00 · 2025-05-31 15:15:12 -03:00 · 858b5e0d6e
commit 858b5e0d6e
parent 6ec531700b c876aeff53
24 changed files with 2400 additions and 2352 deletions
--- a/.env.example
+++ b/.env.example
@ -9,7 +9,7 @@ OPENAI_API_KEY=

 # GEMINI
 # this is the best model for long context and podcast generation
-# GEMINI_API_KEY=
+# GOOGLE_API_KEY=

 # VERTEXAI
 # VERTEX_PROJECT=my-google-cloud-project-name
@ -41,7 +41,7 @@ ELEVENLABS_API_KEY=

 # CONNECTION DETAILS FOR YOUR SURREAL DB
 # Use surrealdb if using docker-compose or add your server ip if using a different setup
-SURREAL_ADDRESS="surrealdb"
+SURREAL_ADDRESS="localhost"
 SURREAL_PORT=8000
 SURREAL_USER="root"
 SURREAL_PASS="root"
@ -57,3 +57,10 @@ SUMMARY_CHUNK_OVERLAP=1000
 # It is measured in characters, not tokens.
 EMBEDDING_CHUNK_SIZE=1000
 EMBEDDING_CHUNK_OVERLAP=50
+
+
+# FIRECRAWL - Get a key at https://firecrawl.dev/
+FIRECRAWL_API_KEY=
+
+# JINA - Get a key at https://jina.ai/
+JINA_API_KEY=
--- a/.gitignore
+++ b/.gitignore
@ -118,3 +118,4 @@ desktop.ini
 *.db
 *.sqlite3

+.quarentena
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,9 +0,0 @@
-repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.4
-    hooks:
-      - id: ruff
-        args: ["--fix"]
-        exclude: "templates"
-      - id: ruff-format
-        exclude: "templates"
--- a/README.md
+++ b/README.md
@ -152,76 +152,21 @@ Go to the [Usage](docs/USAGE.md) page to learn how to use all features.
 ## Features

 - **Multi-Notebook Support**: Organize your research across multiple notebooks effortlessly.
- **Multi-model support**: Open AI, Anthropic, Gemini, Vertex AI, Open Router, Ollama.
+- **Multi-model support**: Open AI, Anthropic, Gemini, Vertex AI, Open Router, X.AI, Groq, Ollama.
 - **Podcast Generator**: Automatically convert your notes into a podcast format.
 - **Broad Content Integration**: Works with links, PDFs, EPUB, Office, TXT, Markdown files, YouTube videos, Audio files, Video files and pasted text.
+- **Content Transformation**: Powerful customizable actions to summarize, extract insights, and more.
 - **AI-Powered Notes**: Write notes yourself or let the AI assist you in generating insights.
 - **Integrated Search Engines**: Built-in full-text and vector search for faster information retrieval.
 - **Fine-Grained Context Management**: Choose exactly what to share with the AI to maintain control.
+- **Citations**: Ask questions about your documents and get answers with citations.

-<p align="right">(<a href="#readme-top">back to top</a>)</p>
-
-## 🚀 New Features
-
-### v0.1 - Release Candidate
-
- Better citations and improved search capabilities
- The "Ask" feature is much smarter now and let's you check its thinking
- Enabled support for X.AI and Groq models
- Select default transformations to apply to all content
- Save insights as custom notes
- Items are added to context by default
-
-### v0.0.10 - Gemini podcast model
-
- Added the Gemini model for generating much more fluid and engaging podcasts
-
-### v0.0.9 - Ask your Documents and Citations  ❓
-
- Ask questions about your documents and get answers with citations
-
-### v0.0.7 - Model Management  🗂️
-
- Manage your AI models and providers in a single interface
- Define default models for several tasks such as chat, transformation, embedding, etc
- Enabled support for Embedding models from Gemini, Vertex and Ollama
-
-### v0.0.6 - ePub and Office files support 📄
-
-You can now process ePub and Office files (Word, Excel, PowerPoint), extracting text and insights from them. Perfect for books, reports, presentations, and more.
-
-### v0.0.5 - Audio and Video support 📽️
-
-You can now process audio and video files, extracting transcripts and insights from them. Perfect for podcasts, interviews, lectures, and more.
-
-### v0.0.4 - Podcasts  🎙️
-
-You can now build amazing custom podcasts based on your own data. Customize your speakers, episode structure, cadence, voices, etc. 
-
-Check out a sample using my own voice created on Eleven Labs and a interview format. 

 [![Check out our podcast sample](https://img.youtube.com/vi/D-760MlGwaI/0.jpg)](https://www.youtube.com/watch?v=D-760MlGwaI)

-You can generate your podcast in dozens of languages.
+<p align="right">(<a href="#readme-top">back to top</a>)</p>

-Head to the [Podcasts](docs/PODCASTS.md) page for more info

-### v0.0.3 - Transformations ✨
-
-We just release a much more powerful way to create more value from your sources.
-Transformations enable you do extract an unlimited amount of insights from your content.
-It's 100% customizable and you can extend it to your own needs, like Paper Analysis, Article Writing, etc.
-
-Head to the [Transformations](docs/TRANSFORMATIONS.md) page for more info
-
-### v0.0.2 - Several new providers are supported now:
-
- OpenAI
- Anthropic
- Open Router
- LiteLLM
- Vertex AI
- Ollama

 ### 📝 Notebook Page

@ -268,7 +213,6 @@ See the [open issues](https://github.com/lfnovo/open-notebook/issues) for a full
 <p align="right">(<a href="#readme-top">back to top</a>)</p>


-
 <!-- CONTRIBUTING -->
 ## Contributing

@ -307,6 +251,9 @@ Join our [Discord server](https://discord.gg/37XJPXfz2w) for help, share workflo
 This project uses some amazing third-party libraries

 * [Podcastfy](https://github.com/souzatharsis/podcastfy) - Licensed under the Apache License 2.0
+* [Content Core](https://github.com/lfnovo/content-core) - Licensed under the MIT License
+* [Docling](https://github.com/docling-project/docling) - Licensed under the MIT License
+* [Esperanto](https://github.com/lfnovo/esperanto) - Licensed under the MIT License

 <p align="right">(<a href="#readme-top">back to top</a>)</p>

--- a/open_notebook/domain/content_settings.py
+++ b/open_notebook/domain/content_settings.py
@ -0,0 +1,21 @@
+from typing import ClassVar, Literal, Optional
+
+from pydantic import Field
+
+from open_notebook.domain.base import RecordModel
+
+
+class ContentSettings(RecordModel):
+    record_id: ClassVar[str] = "open_notebook:content_settings"
+    default_content_processing_engine_doc: Optional[
+        Literal["auto", "docling", "simple"]
+    ] = Field("auto", description="Default Content Processing Engine for Documents")
+    default_content_processing_engine_url: Optional[
+        Literal["auto", "firecrawl", "jina", "simple"]
+    ] = Field("auto", description="Default Content Processing Engine for URLs")
+    default_embedding_option: Optional[Literal["ask", "always", "never"]] = Field(
+        "ask", description="Default Embedding Option for Vector Search"
+    )
+    auto_delete_files: Optional[Literal["yes", "no"]] = Field(
+        "yes", description="Auto Delete Uploaded Files"
+    )
--- a/open_notebook/graphs/content_processing/init.py
+++ b/open_notebook/graphs/content_processing/init.py
@ -1,145 +0,0 @@
-import os
-from typing import Any, Dict
-
-import magic
-from langgraph.graph import END, START, StateGraph
-from loguru import logger
-
-from open_notebook.exceptions import UnsupportedTypeException
-from open_notebook.graphs.content_processing.audio import extract_audio
-from open_notebook.graphs.content_processing.office import (
-    SUPPORTED_OFFICE_TYPES,
-    extract_office_content,
-)
-from open_notebook.graphs.content_processing.pdf import (
-    SUPPORTED_FITZ_TYPES,
-    extract_pdf,
-)
-from open_notebook.graphs.content_processing.state import ContentState
-from open_notebook.graphs.content_processing.text import extract_txt
-from open_notebook.graphs.content_processing.url import extract_url, url_provider
-from open_notebook.graphs.content_processing.video import extract_best_audio_from_video
-from open_notebook.graphs.content_processing.youtube import extract_youtube_transcript
-
-
-async def source_identification(state: ContentState) -> Dict[str, str]:
-    """
-    Identify the content source based on parameters
-    """
-    if state.get("content"):
-        doc_type = "text"
-    elif state.get("file_path"):
-        doc_type = "file"
-    elif state.get("url"):
-        doc_type = "url"
-    else:
-        raise ValueError("No source provided.")
-
-    return {"source_type": doc_type}
-
-
-async def file_type(state: ContentState) -> Dict[str, Any]:
-    """
-    Identify the file using python-magic
-    """
-    return_dict = {}
-    file_path = state.get("file_path")
-    if file_path is not None:
-        return_dict["identified_type"] = magic.from_file(file_path, mime=True)
-        return_dict["title"] = os.path.basename(file_path)
-    return return_dict
-
-
-async def file_type_edge(data: ContentState) -> str:
-    assert data.get("identified_type"), "Type not identified"
-    identified_type = data["identified_type"]
-
-    if identified_type == "text/plain":
-        return "extract_txt"
-    elif identified_type in SUPPORTED_FITZ_TYPES:
-        return "extract_pdf"
-    elif identified_type in SUPPORTED_OFFICE_TYPES:
-        return "extract_office_content"
-    elif identified_type.startswith("video"):
-        return "extract_best_audio_from_video"
-    elif identified_type.startswith("audio"):
-        return "extract_audio"
-    else:
-        raise UnsupportedTypeException(
-            f"Unsupported file type: {data.get('identified_type')}"
-        )
-
-
-async def delete_file(data: ContentState) -> Dict[str, Any]:
-    if data.get("delete_source"):
-        logger.debug(f"Deleting file: {data.get('file_path')}")
-        file_path = data.get("file_path")
-        if file_path is not None:
-            try:
-                os.remove(file_path)
-                return {"file_path": None}
-            except FileNotFoundError:
-                logger.warning(f"File not found while trying to delete: {file_path}")
-    else:
-        logger.debug("Not deleting file")
-    return {}
-
-
-async def url_type_router(x: ContentState) -> str:
-    return x.get("identified_type", "")
-
-
-async def source_type_router(x: ContentState) -> str:
-    return x.get("source_type", "")
-
-
-# Create workflow
-workflow = StateGraph(ContentState)
-
-# Add nodes
-workflow.add_node("source", source_identification)
-workflow.add_node("url_provider", url_provider)
-workflow.add_node("file_type", file_type)
-workflow.add_node("extract_txt", extract_txt)
-workflow.add_node("extract_pdf", extract_pdf)
-workflow.add_node("extract_url", extract_url)
-workflow.add_node("extract_office_content", extract_office_content)
-workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
-workflow.add_node("extract_audio", extract_audio)
-workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
-workflow.add_node("delete_file", delete_file)
-
-# Add edges
-workflow.add_edge(START, "source")
-workflow.add_conditional_edges(
-    "source",
-    source_type_router,
-    {
-        "url": "url_provider",
-        "file": "file_type",
-        "text": END,
-    },
-)
-workflow.add_conditional_edges(
-    "file_type",
-    file_type_edge,
-)
-workflow.add_conditional_edges(
-    "url_provider",
-    url_type_router,
-    {"article": "extract_url", "youtube": "extract_youtube_transcript"},
-)
-workflow.add_edge("url_provider", END)
-workflow.add_edge("file_type", END)
-workflow.add_edge("extract_url", END)
-workflow.add_edge("extract_txt", END)
-workflow.add_edge("extract_youtube_transcript", END)
-
-workflow.add_edge("extract_pdf", "delete_file")
-workflow.add_edge("extract_office_content", "delete_file")
-workflow.add_edge("extract_best_audio_from_video", "extract_audio")
-workflow.add_edge("extract_audio", "delete_file")
-workflow.add_edge("delete_file", END)
-
-# Compile graph
-graph = workflow.compile()
--- a/open_notebook/graphs/content_processing/audio.py
+++ b/open_notebook/graphs/content_processing/audio.py
@ -1,114 +0,0 @@
-import asyncio
-import os
-from functools import partial
-from math import ceil
-
-from loguru import logger
-from pydub import AudioSegment
-
-from open_notebook.domain.models import model_manager
-from open_notebook.graphs.content_processing.state import ContentState
-
-# todo: remove reference to model_manager
-# future: parallelize the transcription process
-
-
-async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
-    """
-    Split an audio file into segments asynchronously.
-    """
-
-    def _split(input_file, segment_length_minutes, output_prefix):
-        # Convert input file to absolute path
-        input_file_abs = os.path.abspath(input_file)
-        output_dir = os.path.dirname(input_file_abs)
-        os.makedirs(output_dir, exist_ok=True)
-
-        # Set up output prefix
-        if output_prefix is None:
-            output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
-
-        # Load the audio file
-        audio = AudioSegment.from_file(input_file_abs)
-
-        # Calculate segment length in milliseconds
-        segment_length_ms = segment_length_minutes * 60 * 1000
-
-        # Calculate number of segments
-        total_segments = ceil(len(audio) / segment_length_ms)
-        logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
-
-        output_files = []
-
-        # Split the audio into segments
-        for i in range(total_segments):
-            start_time = i * segment_length_ms
-            end_time = min((i + 1) * segment_length_ms, len(audio))
-
-            # Extract segment
-            segment = audio[start_time:end_time]
-
-            # Generate output filename
-            output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
-            output_path = os.path.join(output_dir, output_filename)
-
-            # Export segment
-            segment.export(output_path, format="mp3")
-            output_files.append(output_path)
-
-            logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
-
-        return output_files
-
-    # Run CPU-bound audio processing in thread pool
-    return await asyncio.get_event_loop().run_in_executor(
-        None, partial(_split, input_file, segment_length_minutes, output_prefix)
-    )
-
-
-async def transcribe_audio_segment(audio_file, model):
-    """Transcribe a single audio segment asynchronously"""
-
-    def _transcribe(audio_file, model):
-        return model.transcribe(audio_file)
-
-    return await asyncio.get_event_loop().run_in_executor(
-        None, partial(_transcribe, audio_file, model)
-    )
-
-
-async def extract_audio(data: ContentState):
-    SPEECH_TO_TEXT_MODEL = model_manager.speech_to_text
-    input_audio_path = data.get("file_path")
-    audio_files = []
-
-    try:
-        # Split audio into segments
-        audio_files = await split_audio(input_audio_path)
-
-        # Transcribe all segments concurrently
-        transcribe_tasks = [
-            transcribe_audio_segment(audio_file, SPEECH_TO_TEXT_MODEL)
-            for audio_file in audio_files
-        ]
-        transcriptions = await asyncio.gather(*transcribe_tasks)
-
-        return {"content": " ".join(transcriptions)}
-
-    except Exception as e:
-        logger.error(f"Error transcribing audio: {str(e)}")
-        logger.exception(e)
-        raise
-
-    finally:
-        # Clean up temporary files
-        def _cleanup(files):
-            for file in files:
-                try:
-                    os.remove(file)
-                except OSError as e:
-                    logger.error(f"Error removing temporary file {file}: {str(e)}")
-
-        await asyncio.get_event_loop().run_in_executor(
-            None, partial(_cleanup, audio_files)
-        )
--- a/open_notebook/graphs/content_processing/office.py
+++ b/open_notebook/graphs/content_processing/office.py
@ -1,323 +0,0 @@
-import asyncio
-from functools import partial
-
-from docx import Document
-from loguru import logger
-from openpyxl import load_workbook
-from pptx import Presentation
-
-from open_notebook.graphs.content_processing.state import ContentState
-
-SUPPORTED_OFFICE_TYPES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-]
-
-
-async def extract_docx_content_detailed(file_path):
-    """Extract content from DOCX file"""
-
-    def _extract():
-        try:
-            doc = Document(file_path)
-            content = []
-
-            for paragraph in doc.paragraphs:
-                if not paragraph.text.strip():
-                    continue
-
-                style = paragraph.style.name if paragraph.style else "Normal"
-                text = paragraph.text.strip()
-
-                # Get paragraph formatting
-                p_format = paragraph.paragraph_format
-                indent = p_format.left_indent or 0
-
-                # Convert indent to spaces (1 level = 4 spaces)
-                indent_level = 0
-                if hasattr(indent, "pt"):
-                    indent_level = int(indent.pt / 72)  # 72 points = 1 inch
-                indent_spaces = " " * (indent_level * 4)
-
-                # Handle different types of formatting
-                if "Heading" in style:
-                    level = style[-1] if style[-1].isdigit() else "1"
-                    heading_marks = "#" * int(level)
-                    content.append(f"\n{heading_marks} {text}\n")
-
-                # Handle bullet points
-                elif (
-                    paragraph.style
-                    and hasattr(paragraph.style, "name")
-                    and paragraph.style.name.startswith("List")
-                ):
-                    # Numbered list
-                    if (
-                        hasattr(paragraph._p, "pPr")
-                        and paragraph._p.pPr is not None
-                        and hasattr(paragraph._p.pPr, "numPr")
-                        and paragraph._p.pPr.numPr is not None
-                    ):
-                        # Try to get the actual number
-                        try:
-                            if (
-                                hasattr(paragraph._p.pPr.numPr, "numId")
-                                and paragraph._p.pPr.numPr.numId is not None
-                                and hasattr(paragraph._p.pPr.numPr.numId, "val")
-                            ):
-                                number = paragraph._p.pPr.numPr.numId.val
-                                content.append(f"{indent_spaces}{number}. {text}")
-                            else:
-                                content.append(f"{indent_spaces}1. {text}")
-                        except Exception:
-                            content.append(f"{indent_spaces}1. {text}")
-                    # Bullet list
-                    else:
-                        content.append(f"{indent_spaces}* {text}")
-
-                else:
-                    # Handle text formatting
-                    formatted_text = []
-                    for run in paragraph.runs:
-                        if run.bold:
-                            formatted_text.append(f"**{run.text}**")
-                        elif run.italic:
-                            formatted_text.append(f"*{run.text}*")
-                        else:
-                            formatted_text.append(run.text)
-
-                    content.append(f"{indent_spaces}{''.join(formatted_text)}")
-
-            return "\n\n".join(content)
-
-        except Exception as e:
-            logger.error(f"Failed to extract DOCX content: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, _extract)
-
-
-async def get_docx_info(file_path):
-    """Get DOCX metadata and content"""
-
-    def _get_info():
-        try:
-            doc = Document(file_path)
-
-            # Extract core properties if available
-            core_props = {
-                "author": doc.core_properties.author,
-                "created": doc.core_properties.created,
-                "modified": doc.core_properties.modified,
-                "title": doc.core_properties.title,
-                "subject": doc.core_properties.subject,
-                "keywords": doc.core_properties.keywords,
-                "category": doc.core_properties.category,
-                "comments": doc.core_properties.comments,
-            }
-
-            # Get document content
-            content = extract_docx_content_detailed(file_path)
-
-            # Get document statistics
-            stats = {
-                "paragraph_count": len(doc.paragraphs),
-                "word_count": sum(
-                    len(p.text.split()) for p in doc.paragraphs if p.text.strip()
-                ),
-                "character_count": sum(
-                    len(p.text) for p in doc.paragraphs if p.text.strip()
-                ),
-            }
-
-            return {"metadata": core_props, "content": content, "statistics": stats}
-
-        except Exception as e:
-            logger.error(f"Failed to get DOCX info: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, _get_info)
-
-
-async def extract_pptx_content(file_path):
-    """Extract content from PPTX file"""
-
-    def _extract():
-        try:
-            prs = Presentation(file_path)
-            content = []
-
-            for slide_number, slide in enumerate(prs.slides, 1):
-                content.append(f"\n# Slide {slide_number}\n")
-
-                # Extract title
-                if slide.shapes.title:
-                    content.append(f"## {slide.shapes.title.text}\n")
-
-                # Extract text from all shapes
-                for shape in slide.shapes:
-                    if hasattr(shape, "text") and shape.text.strip():
-                        if (
-                            shape != slide.shapes.title
-                        ):  # Skip title as it's already added
-                            content.append(shape.text.strip())
-
-            return "\n\n".join(content)
-
-        except Exception as e:
-            logger.error(f"Failed to extract PPTX content: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, _extract)
-
-
-async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
-    """Extract content from XLSX file"""
-
-    def _extract():
-        try:
-            wb = load_workbook(file_path, data_only=True)
-            content = []
-
-            for sheet in wb.sheetnames:
-                ws = wb[sheet]
-                content.append(f"\n# Sheet: {sheet}\n")
-
-                # Get the maximum row and column with data
-                max_row = min(ws.max_row, max_rows)
-                max_col = min(ws.max_column, max_cols)
-
-                # Create markdown table header
-                headers = []
-                for col in range(1, max_col + 1):
-                    cell_value = ws.cell(row=1, column=col).value
-                    headers.append(str(cell_value) if cell_value is not None else "")
-
-                content.append("| " + " | ".join(headers) + " |")
-                content.append("| " + " | ".join(["---"] * len(headers)) + " |")
-
-                # Add table content
-                for row in range(2, max_row + 1):
-                    row_data = []
-                    for col in range(1, max_col + 1):
-                        cell_value = ws.cell(row=row, column=col).value
-                        row_data.append(
-                            str(cell_value) if cell_value is not None else ""
-                        )
-                    content.append("| " + " | ".join(row_data) + " |")
-
-            return "\n".join(content)
-
-        except Exception as e:
-            logger.error(f"Failed to extract XLSX content: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, partial(_extract))
-
-
-async def get_pptx_info(file_path):
-    """Get PPTX metadata and content"""
-
-    def _get_info():
-        try:
-            prs = Presentation(file_path)
-
-            # Extract basic properties
-            props = {
-                "slide_count": len(prs.slides),
-                "title": "",  # PowerPoint doesn't have built-in metadata like Word
-            }
-
-            # Get document content
-            content = extract_pptx_content(file_path)
-
-            # Get presentation statistics
-            stats = {
-                "slide_count": len(prs.slides),
-                "shape_count": sum(len(slide.shapes) for slide in prs.slides),
-                "text_frame_count": sum(
-                    sum(1 for shape in slide.shapes if hasattr(shape, "text"))
-                    for slide in prs.slides
-                ),
-            }
-
-            return {"metadata": props, "content": content, "statistics": stats}
-
-        except Exception as e:
-            logger.error(f"Failed to get PPTX info: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, _get_info)
-
-
-async def get_xlsx_info(file_path):
-    """Get XLSX metadata and content"""
-
-    def _get_info():
-        try:
-            wb = load_workbook(file_path, data_only=True)
-
-            # Extract basic properties
-            props = {
-                "sheet_count": len(wb.sheetnames),
-                "sheets": wb.sheetnames,
-                "title": wb.properties.title,
-                "creator": wb.properties.creator,
-                "created": wb.properties.created,
-                "modified": wb.properties.modified,
-            }
-
-            # Get document content
-            content = extract_xlsx_content(file_path)
-
-            # Get workbook statistics
-            stats = {
-                "sheet_count": len(wb.sheetnames),
-                "total_rows": sum(sheet.max_row for sheet in wb.worksheets),
-                "total_columns": sum(sheet.max_column for sheet in wb.worksheets),
-            }
-
-            return {"metadata": props, "content": content, "statistics": stats}
-
-        except Exception as e:
-            logger.error(f"Failed to get XLSX info: {e}")
-            return None
-
-    return await asyncio.get_event_loop().run_in_executor(None, _get_info)
-
-
-async def extract_office_content(state: ContentState):
-    """Universal function to extract content from Office files"""
-    assert state.get("file_path"), "No file path provided"
-    assert (
-        state.get("identified_type") in SUPPORTED_OFFICE_TYPES
-    ), "Unsupported File Type"
-    file_path = state["file_path"]
-    doc_type = state["identified_type"]
-
-    if (
-        doc_type
-        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    ):
-        logger.debug("Extracting content from DOCX file")
-        content = await extract_docx_content_detailed(file_path)
-        info = await get_docx_info(file_path)
-    elif (
-        doc_type
-        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-    ):
-        logger.debug("Extracting content from PPTX file")
-        content = await extract_pptx_content(file_path)
-        info = await get_pptx_info(file_path)
-    elif (
-        doc_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-    ):
-        logger.debug("Extracting content from XLSX file")
-        content = await extract_xlsx_content(file_path)
-        info = await get_xlsx_info(file_path)
-    else:
-        raise Exception(f"Unsupported file format: {doc_type}")
-
-    del info["content"]
-    return {"content": content, "metadata": info}
--- a/open_notebook/graphs/content_processing/pdf.py
+++ b/open_notebook/graphs/content_processing/pdf.py
@ -1,170 +0,0 @@
-import asyncio
-import re
-import unicodedata
-
-import fitz  # type: ignore
-from loguru import logger
-
-from open_notebook.graphs.content_processing.state import ContentState
-
-# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
-# todo: what else can we do to make the text more readable?
-# todo: try to fix encoding for some PDF that is still breaking
-# def _extract_text_from_pdf(pdf_path):
-#     doc = fitz.open(pdf_path)
-#     text = ""
-#     logger.debug(f"Found {len(doc)} pages in PDF")
-#     for page in doc:
-#         # Use encode/decode if you need to clean up any encoding issues
-#         text += page.get_text().encode('utf-8').decode('utf-8')
-#     doc.close()
-#     return text
-
-SUPPORTED_FITZ_TYPES = [
-    "application/pdf",
-    "application/epub+zip",
-]
-
-
-def clean_pdf_text(text):
-    """
-    Clean text extracted from PDFs with enhanced space handling.
-    Preserves special characters like (, ), %, = that are valid in code/math.
-
-    Args:
-        text (str): The raw text extracted from a PDF
-    Returns:
-        str: Cleaned text with minimal necessary spacing
-    """
-    if not text:
-        return text
-
-    # Step 1: Normalize Unicode characters
-    text = unicodedata.normalize("NFKC", text)
-
-    # Step 2: Replace common PDF artifacts
-    replacements = {
-        # Common ligatures
-        "ﬁ": "fi",
-        "ﬂ": "fl",
-        "ﬀ": "ff",
-        "ﬃ": "ffi",
-        "ﬄ": "ffl",
-        # Quotation marks and apostrophes
-        """: "'", """: "'",
-        '"': '"',
-        "′": "'",
-        "‚": ",",
-        "„": '"',
-        # Dashes and hyphens
-        "‒": "-",
-        "–": "-",
-        "—": "-",
-        "―": "-",
-        # Other common replacements
-        "…": "...",
-        "•": "*",
-        "°": " degrees ",
-        "¹": "1",
-        "²": "2",
-        "³": "3",
-        "©": "(c)",
-        "®": "(R)",
-        "™": "(TM)",
-    }
-    for old, new in replacements.items():
-        text = text.replace(old, new)
-
-    # Step 3: Clean control characters while preserving essential whitespace and special chars
-    text = "".join(
-        char
-        for char in text
-        if unicodedata.category(char)[0] != "C"
-        or char in "\n\t "
-        or char in "()%=[]{}#$@!?.,;:+-*/^<>&|~"
-    )
-
-    # Step 4: Enhanced space cleaning
-    text = re.sub(r"[ \t]+", " ", text)  # Consolidate horizontal whitespace
-    text = re.sub(r" +\n", "\n", text)  # Remove spaces before newlines
-    text = re.sub(r"\n +", "\n", text)  # Remove spaces after newlines
-    text = re.sub(r"\n\t+", "\n", text)  # Remove tabs at start of lines
-    text = re.sub(r"\t+\n", "\n", text)  # Remove tabs at end of lines
-    text = re.sub(r"\t+", " ", text)  # Replace tabs with single space
-
-    # Step 5: Remove empty lines while preserving paragraph structure
-    text = re.sub(r"\n{3,}", "\n\n", text)  # Max two consecutive newlines
-    text = re.sub(r"^\s+", "", text)  # Remove leading whitespace
-    text = re.sub(r"\s+$", "", text)  # Remove trailing whitespace
-
-    # Step 6: Clean up around punctuation
-    text = re.sub(r"\s+([.,;:!?)])", r"\1", text)  # Remove spaces before punctuation
-    text = re.sub(r"(\()\s+", r"\1", text)  # Remove spaces after opening parenthesis
-    text = re.sub(
-        r"\s+([.,])\s+", r"\1 ", text
-    )  # Ensure single space after periods and commas
-
-    # Step 7: Remove zero-width and invisible characters
-    text = re.sub(r"[\u200b\u200c\u200d\ufeff\u200e\u200f]", "", text)
-
-    # Step 8: Fix hyphenation and line breaks
-    text = re.sub(
-        r"(?<=\w)-\s*\n\s*(?=\w)", "", text
-    )  # Remove hyphenation at line breaks
-
-    return text.strip()
-
-
-async def _extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    try:
-        text = ""
-        logger.debug(f"Found {len(doc)} pages in PDF")
-        for page in doc:
-            text += page.get_text()
-        normalized_text = clean_pdf_text(text)
-        return normalized_text
-    finally:
-        doc.close()
-
-
-async def _extract_text_from_pdf(pdf_path):
-    """Extract text from PDF asynchronously"""
-
-    def _extract():
-        doc = fitz.open(pdf_path)
-        try:
-            text = ""
-            logger.debug(f"Found {len(doc)} pages in PDF")
-            for page in doc:
-                text += page.get_text()
-            return clean_pdf_text(text)
-        finally:
-            doc.close()
-
-    # Run CPU-bound PDF processing in a thread pool
-    return await asyncio.get_event_loop().run_in_executor(None, _extract)
-
-
-async def extract_pdf(state: ContentState):
-    """
-    Parse the PDF file and extract its content asynchronously.
-    """
-    return_dict = {}
-    assert state.get("file_path"), "No file path provided"
-    assert state.get("identified_type") in SUPPORTED_FITZ_TYPES, "Unsupported File Type"
-
-    if (
-        state.get("file_path") is not None
-        and state.get("identified_type") in SUPPORTED_FITZ_TYPES
-    ):
-        file_path = state.get("file_path")
-        try:
-            text = await _extract_text_from_pdf(file_path)
-            return_dict["content"] = text
-        except FileNotFoundError:
-            raise FileNotFoundError(f"File not found at {file_path}")
-        except Exception as e:
-            raise Exception(f"An error occurred: {e}")
-
-    return return_dict
--- a/open_notebook/graphs/content_processing/state.py
+++ b/open_notebook/graphs/content_processing/state.py
@ -1,13 +0,0 @@
-from typing_extensions import TypedDict
-
-
-class ContentState(TypedDict):
-    content: str
-    file_path: str
-    url: str
-    title: str
-    source_type: str
-    identified_type: str
-    identified_provider: str
-    metadata: dict
-    delete_source: bool = False
--- a/open_notebook/graphs/content_processing/text.py
+++ b/open_notebook/graphs/content_processing/text.py
@ -1,40 +0,0 @@
-import asyncio
-
-from loguru import logger
-
-from open_notebook.graphs.content_processing.state import ContentState
-
-
-async def extract_txt(state: ContentState):
-    """
-    Parse the text file and extract its content asynchronously.
-    """
-    return_dict = {}
-    if (
-        state.get("file_path") is not None
-        and state.get("identified_type") == "text/plain"
-    ):
-        logger.debug(f"Extracting text from {state.get('file_path')}")
-        file_path = state.get("file_path")
-
-        if file_path is not None:
-            try:
-
-                def _read_file():
-                    with open(file_path, "r", encoding="utf-8") as file:
-                        return file.read()
-
-                # Run file I/O in thread pool
-                content = await asyncio.get_event_loop().run_in_executor(
-                    None, _read_file
-                )
-
-                logger.debug(f"Extracted: {content[:100]}")
-                return_dict["content"] = content
-
-            except FileNotFoundError:
-                raise FileNotFoundError(f"File not found at {file_path}")
-            except Exception as e:
-                raise Exception(f"An error occurred: {e}")
-
-    return return_dict
--- a/open_notebook/graphs/content_processing/url.py
+++ b/open_notebook/graphs/content_processing/url.py
@ -1,191 +0,0 @@
-import re
-from urllib.parse import urlparse
-
-import aiohttp
-from bs4 import BeautifulSoup, Comment
-from loguru import logger
-
-from open_notebook.graphs.content_processing.state import ContentState
-
-# future: better extraction methods
-# https://github.com/buriy/python-readability
-# also try readability: from readability import Document
-
-
-def url_provider(state: ContentState):
-    """
-    Identify the provider
-    """
-    return_dict = {}
-    url = state.get("url")
-    if url:
-        if "youtube.com" in url or "youtu.be" in url:
-            return_dict["identified_type"] = (
-                "youtube"  # future: playlists, channels in the future
-            )
-        else:
-            return_dict["identified_type"] = "article"
-            # future: article providers in the future
-    return return_dict
-
-
-async def extract_url_bs4(url: str):
-    """
-    Get the title and content of a URL using bs4
-    """
-    try:
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
-
-        # If URL is actually HTML content
-        if url.startswith("<!DOCTYPE html>") or url.startswith("<html"):
-            html_content = url
-        else:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers=headers, timeout=10) as response:
-                    response.raise_for_status()
-                    html_content = await response.text()
-
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Remove unwanted elements
-        for element in soup.find_all(
-            ["script", "style", "nav", "footer", "iframe", "noscript", "ad"]
-        ):
-            element.decompose()
-
-        # Remove comments
-        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
-            comment.extract()
-
-        # Get title
-        title = None
-        title_tags = [
-            soup.find("meta", property="og:title"),
-            soup.find("meta", property="twitter:title"),
-            soup.find("title"),
-            soup.find("h1"),
-        ]
-
-        for tag in title_tags:
-            if tag:
-                if tag.string:
-                    title = tag.string
-                elif tag.get("content"):
-                    title = tag.get("content")
-                break
-
-        # Clean up title
-        if title:
-            title = " ".join(title.split())
-            title = re.sub(r"\s*\|.*$", "", title)
-            title = re.sub(r"\s*-.*$", "", title)
-
-        # Get content
-        content = []
-
-        # Look for main article content
-        main_content = None
-        content_tags = [
-            soup.find("article"),
-            soup.find("main"),
-            soup.find(class_=re.compile(r"article|post|content|entry|document")),
-            soup.find(id=re.compile(r"article|post|content|entry|main")),
-        ]
-
-        for tag in content_tags:
-            if tag:
-                main_content = tag
-                break
-
-        if not main_content:
-            main_content = soup
-
-        # Process content
-        for element in main_content.find_all(
-            ["p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "div"]
-        ):
-            # Handle code blocks
-            if element.name == "pre" or "highlight" in element.get("class", []):
-                code_text = element.get_text().strip()
-                if code_text:
-                    content.append("\n```\n" + code_text + "\n```\n")
-                continue
-
-            # Handle regular text
-            text = element.get_text().strip()
-            if text:
-                # Skip if text matches common patterns for navigation/footer
-                if re.search(
-                    r"copyright|all rights reserved|privacy policy|terms of use",
-                    text.lower(),
-                ):
-                    continue
-
-                content.append(text)
-
-        # Join content with proper spacing
-        final_content = "\n\n".join(content)
-
-        # Clean up content
-        final_content = re.sub(
-            r"\n\s*\n\s*\n", "\n\n", final_content
-        )  # Remove extra newlines
-        final_content = re.sub(r" +", " ", final_content)  # Normalize whitespace
-        final_content = final_content.strip()
-
-        return {
-            "title": title,
-            "content": final_content,
-            "domain": urlparse(url).netloc
-            if not url.startswith("<!DOCTYPE html>")
-            else None,
-            "url": url if not url.startswith("<!DOCTYPE html>") else None,
-        }
-
-    except aiohttp.ClientError as e:
-        logger.error(f"Failed to fetch URL {url}: {e}")
-        return None
-    except Exception as e:
-        logger.error(f"Failed to process content: {e}")
-        return None
-
-
-async def extract_url_jina(url: str):
-    """
-    Get the content of a URL using Jina
-    """
-    async with aiohttp.ClientSession() as session:
-        async with session.get(f"https://r.jina.ai/{url}") as response:
-            text = await response.text()
-            if text.startswith("Title:") and "\n" in text:
-                title_end = text.index("\n")
-                title = text[6:title_end].strip()
-                content = text[title_end + 1 :].strip()
-                logger.debug(
-                    f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
-                )
-                return {"title": title, "content": content}
-            else:
-                logger.debug(
-                    f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
-                )
-                return {"content": text}
-
-
-async def extract_url(state: ContentState):
-    assert state.get("url"), "No URL provided"
-    url = state["url"]
-    try:
-        result = await extract_url_bs4(url)
-        if not result or not result.get("content"):
-            logger.debug(
-                f"BS4 extraction failed for url {url}, falling back to Jina extractor"
-            )
-            result = await extract_url_jina(url)
-        return result
-    except Exception as e:
-        logger.error(f"URL extraction failed for URL: {url}")
-        logger.exception(e)
-        return None
--- a/open_notebook/graphs/content_processing/video.py
+++ b/open_notebook/graphs/content_processing/video.py
@ -1,167 +0,0 @@
-import asyncio
-import json
-import os
-import subprocess
-from functools import partial
-
-from loguru import logger
-
-from open_notebook.graphs.content_processing.state import ContentState
-
-
-async def extract_audio_from_video(input_file, output_file, stream_index):
-    """
-    Extract the specified audio stream to MP3 format asynchronously
-    """
-
-    def _extract(input_file, output_file, stream_index):
-        try:
-            cmd = [
-                "ffmpeg",
-                "-i",
-                input_file,
-                "-map",
-                f"0:a:{stream_index}",  # Select specific audio stream
-                "-codec:a",
-                "libmp3lame",  # Use MP3 codec
-                "-q:a",
-                "2",  # High quality setting
-                "-y",  # Overwrite output file if exists
-                output_file,
-            ]
-
-            result = subprocess.run(cmd, capture_output=True, text=True)
-            if result.returncode != 0:
-                raise Exception(f"FFmpeg failed: {result.stderr}")
-
-            return True
-
-        except Exception as e:
-            logger.error(f"Error extracting audio: {str(e)}")
-            return False
-
-    return await asyncio.get_event_loop().run_in_executor(
-        None, partial(_extract, input_file, output_file, stream_index)
-    )
-
-
-async def get_audio_streams(input_file):
-    """
-    Analyze video file and return information about all audio streams asynchronously
-    """
-
-    def _analyze(input_file):
-        logger.debug(f"Analyzing video file {input_file} for audio streams")
-        try:
-            cmd = [
-                "ffprobe",
-                "-v",
-                "quiet",
-                "-print_format",
-                "json",
-                "-show_streams",
-                "-select_streams",
-                "a",
-                input_file,
-            ]
-
-            result = subprocess.run(cmd, capture_output=True, text=True)
-            if result.returncode != 0:
-                raise Exception(f"FFprobe failed: {result.stderr}")
-
-            data = json.loads(result.stdout)
-            return data.get("streams", [])
-
-        except Exception as e:
-            logger.error(f"Error analyzing file: {str(e)}")
-            return []
-
-    return await asyncio.get_event_loop().run_in_executor(
-        None, partial(_analyze, input_file)
-    )
-
-
-async def select_best_audio_stream(streams):
-    """
-    Select the best audio stream based on various quality metrics
-    """
-
-    def _select(streams):
-        if not streams:
-            logger.debug("No audio streams found")
-            return None
-        else:
-            logger.debug(f"Found {len(streams)} audio streams")
-
-        # Score each stream based on various factors
-        scored_streams = []
-        for stream in streams:
-            score = 0
-
-            # Prefer higher bit rates
-            bit_rate = stream.get("bit_rate")
-            if bit_rate:
-                score += int(int(bit_rate) / 1000000)  # Convert to Mbps and ensure int
-
-            # Prefer more channels (stereo over mono)
-            channels = stream.get("channels", 0)
-            score += channels * 10
-
-            # Prefer higher sample rates
-            sample_rate = stream.get("sample_rate", "0")
-            score += int(int(sample_rate) / 48000)
-
-            scored_streams.append((score, stream))
-
-        # Return the stream with highest score
-        return max(scored_streams, key=lambda x: x[0])[1]
-
-    return await asyncio.get_event_loop().run_in_executor(
-        None, partial(_select, streams)
-    )
-
-
-async def extract_best_audio_from_video(data: ContentState):
-    """
-    Main function to extract the best audio stream from a video file asynchronously
-    """
-    input_file = data.get("file_path")
-    assert input_file is not None, "Input file path must be provided"
-
-    def _check_file(path):
-        return os.path.exists(path)
-
-    file_exists = await asyncio.get_event_loop().run_in_executor(
-        None, partial(_check_file, input_file)
-    )
-
-    if not file_exists:
-        logger.critical(f"Input file not found: {input_file}")
-        return False
-
-    base_name = os.path.splitext(input_file)[0]
-    output_file = f"{base_name}_audio.mp3"
-
-    # Get all audio streams
-    streams = await get_audio_streams(input_file)
-    if not streams:
-        logger.debug("No audio streams found in the file")
-        return False
-
-    # Select best stream
-    best_stream = await select_best_audio_stream(streams)
-    if not best_stream:
-        logger.error("Could not determine best audio stream")
-        return False
-
-    # Extract the selected stream
-    stream_index = streams.index(best_stream)
-    success = await extract_audio_from_video(input_file, output_file, stream_index)
-
-    if success:
-        logger.debug(f"Successfully extracted audio to: {output_file}")
-        logger.debug(f"- Channels: {best_stream.get('channels', 'unknown')}")
-        logger.debug(f"- Sample rate: {best_stream.get('sample_rate', 'unknown')} Hz")
-        logger.debug(f"- Bit rate: {best_stream.get('bit_rate', 'unknown')} bits/s")
-
-    return {"file_path": output_file, "identified_type": "audio/mp3"}
--- a/open_notebook/graphs/content_processing/youtube.py
+++ b/open_notebook/graphs/content_processing/youtube.py
@ -1,159 +0,0 @@
-import re
-import ssl
-
-import aiohttp
-from bs4 import BeautifulSoup
-from loguru import logger
-from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
-from youtube_transcript_api.formatters import TextFormatter  # type: ignore
-
-from open_notebook.config import CONFIG
-from open_notebook.exceptions import NoTranscriptFound
-from open_notebook.graphs.content_processing.state import ContentState
-
-ssl._create_default_https_context = ssl._create_unverified_context
-
-
-async def get_video_title(video_id):
-    try:
-        url = f"https://www.youtube.com/watch?v={video_id}"
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                html = await response.text()
-
-        # BeautifulSoup doesn't support async operations
-        soup = BeautifulSoup(html, "html.parser")
-
-        # YouTube stores title in a meta tag
-        title = soup.find("meta", property="og:title")["content"]
-        return title
-
-    except Exception as e:
-        logger.error(f"Failed to get video title: {e}")
-        return None
-
-
-def _extract_youtube_id(url):
-    """
-    Extract the YouTube video ID from a given URL using regular expressions.
-
-    Args:
-    url (str): The YouTube URL from which to extract the video ID.
-
-    Returns:
-    str: The extracted YouTube video ID or None if no valid ID is found.
-    """
-    # Define a regular expression pattern to capture the YouTube video ID
-    youtube_regex = (
-        r"(?:https?://)?"  # Optional scheme
-        r"(?:www\.)?"  # Optional www.
-        r"(?:"
-        r"youtu\.be/"  # Shortened URL
-        r"|youtube\.com"  # Main URL
-        r"(?:"  # Group start
-        r"/embed/"  # Embed URL
-        r"|/v/"  # Older video URL
-        r"|/watch\?v="  # Standard watch URL
-        r"|/watch\?.+&v="  # Other watch URL
-        r")"  # Group end
-        r")"  # End main group
-        r"([\w-]{11})"  # 11 characters (YouTube video ID)
-    )
-
-    # Search the URL for the pattern
-    match = re.search(youtube_regex, url)
-
-    # Return the video ID if a match is found
-    return match.group(1) if match else None
-
-
-async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
-    try:
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-
-        # First try: Manual transcripts in preferred languages
-        manual_transcripts = []
-        try:
-            for transcript in transcript_list:
-                if not transcript.is_generated and not transcript.is_translatable:
-                    manual_transcripts.append(transcript)
-
-            if manual_transcripts:
-                # Sort based on preferred language order
-                for lang in preferred_langs:
-                    for transcript in manual_transcripts:
-                        if transcript.language_code == lang:
-                            return transcript.fetch()
-                # If no preferred language found, return first manual transcript
-                return manual_transcripts[0].fetch()
-        except NoTranscriptFound:
-            pass
-
-        # Second try: Auto-generated transcripts in preferred languages
-        generated_transcripts = []
-        try:
-            for transcript in transcript_list:
-                if transcript.is_generated and not transcript.is_translatable:
-                    generated_transcripts.append(transcript)
-
-            if generated_transcripts:
-                # Sort based on preferred language order
-                for lang in preferred_langs:
-                    for transcript in generated_transcripts:
-                        if transcript.language_code == lang:
-                            return transcript.fetch()
-                # If no preferred language found, return first generated transcript
-                return generated_transcripts[0].fetch()
-        except NoTranscriptFound:
-            pass
-
-        # Last try: Translated transcripts in preferred languages
-        translated_transcripts = []
-        try:
-            for transcript in transcript_list:
-                if transcript.is_translatable:
-                    translated_transcripts.append(transcript)
-
-            if translated_transcripts:
-                # Sort based on preferred language order
-                for lang in preferred_langs:
-                    for transcript in translated_transcripts:
-                        if transcript.language_code == lang:
-                            return transcript.fetch()
-                # If no preferred language found, return translation to first preferred language
-                translation = translated_transcripts[0].translate(preferred_langs[0])
-                return translation.fetch()
-        except NoTranscriptFound:
-            pass
-
-        raise Exception("No suitable transcript found")
-
-    except Exception as e:
-        logger.error(f"Failed to get transcript for video {video_id}: {e}")
-        return None
-
-
-async def extract_youtube_transcript(state: ContentState):
-    """
-    Parse the text file and print its content.
-    """
-
-    languages = CONFIG.get("youtube_transcripts", {}).get(
-        "preferred_languages", ["en", "es", "pt"]
-    )
-
-    video_id = _extract_youtube_id(state.get("url"))
-    transcript = await get_best_transcript(video_id, languages)
-
-    logger.debug(f"Found transcript: {transcript}")
-    formatter = TextFormatter()
-    try:
-        title = await get_video_title(video_id)
-    except Exception as e:
-        logger.critical(f"Failed to get video title for video_id: {video_id}")
-        logger.exception(e)
-        title = None
-    return {
-        "content": formatter.format_transcript(transcript),
-        "title": title,
-    }
--- a/open_notebook/graphs/source.py
+++ b/open_notebook/graphs/source.py
@ -1,24 +1,23 @@
 import operator
-from typing import List, Optional
+from typing import Any, Dict, List, Optional

-from langchain_core.runnables import (
-    RunnableConfig,
-)
+from content_core import extract_content
+from content_core.common import ProcessSourceState
+from langchain_core.runnables import RunnableConfig
 from langgraph.graph import END, START, StateGraph
 from langgraph.types import Send
 from loguru import logger
 from typing_extensions import Annotated, TypedDict

+from open_notebook.domain.content_settings import ContentSettings
 from open_notebook.domain.notebook import Asset, Source
 from open_notebook.domain.transformation import Transformation
-from open_notebook.graphs.content_processing import ContentState
-from open_notebook.graphs.content_processing import graph as content_graph
 from open_notebook.graphs.transformation import graph as transform_graph
 from open_notebook.utils import surreal_clean


 class SourceState(TypedDict):
-    content_state: ContentState
+    content_state: ProcessSourceState
    apply_transformations: List[Transformation]
    notebook_id: str
    source: Source
@ -32,9 +31,18 @@ class TransformationState(TypedDict):


 async def content_process(state: SourceState) -> dict:
-    content_state = state["content_state"]
-    logger.info("Content processing started for new content")
-    processed_state = await content_graph.ainvoke(content_state)
+    content_settings = ContentSettings()
+    content_state: Dict[str, Any] = state["content_state"]  # not copied: the keys set below are added to the caller's object
+
+    content_state["url_engine"] = (
+        content_settings.default_content_processing_engine_url or "auto"  # falsy setting -> "auto"
+    )
+    content_state["document_engine"] = (
+        content_settings.default_content_processing_engine_doc or "auto"  # falsy setting -> "auto"
+    )
+    content_state["output_format"] = "markdown"
+
+    processed_state = await extract_content(content_state)  # result replaces content_state in the returned update
    return {"content_state": processed_state}


@ -42,11 +50,9 @@ def save_source(state: SourceState) -> dict:
    content_state = state["content_state"]

    source = Source(
-        asset=Asset(
-            url=content_state.get("url"), file_path=content_state.get("file_path")
-        ),
-        full_text=surreal_clean(content_state["content"]),
-        title=content_state.get("title"),
+        asset=Asset(url=content_state.url, file_path=content_state.file_path),
+        full_text=surreal_clean(content_state.content),
+        title=content_state.title,
    )
    source.save()

--- a/open_notebook/plugins/podcasts.py
+++ b/open_notebook/plugins/podcasts.py
@ -108,7 +108,7 @@ class PodcastConfig(ObjectModel):
                api_key_label = "ANTHROPIC_API_KEY"
                llm_model_name = self.transcript_model
            elif self.transcript_model_provider == "gemini":
-                api_key_label = "GEMINI_API_KEY"
+                api_key_label = "GOOGLE_API_KEY"
                llm_model_name = self.transcript_model

        if self.provider == "gemini":
--- a/pages/10_⚙️_Settings.py
+++ b/pages/10_⚙️_Settings.py
@ -0,0 +1,122 @@
+import os
+
+import streamlit as st
+
+from open_notebook.domain.content_settings import ContentSettings
+from pages.stream_app.utils import setup_page
+
+setup_page("⚙️ Settings")
+
+st.header("⚙️ Settings")
+
+content_settings = ContentSettings()  # its current values pre-select the widgets below; update() is called on Save
+
+with st.container(border=True):
+    st.markdown("**Content Processing Engine for Documents**")
+
+    # An unset or unrecognized stored value falls back to "auto" (index 0)
+    # instead of raising ValueError from list.index().
+    doc_engines = ["auto", "docling", "simple"]
+    doc_saved = content_settings.default_content_processing_engine_doc
+    default_content_processing_engine_doc = st.selectbox(
+        "Default Content Processing Engine for Documents",
+        doc_engines,
+        index=(
+            doc_engines.index(doc_saved) if doc_saved in doc_engines else 0
+        ),
+    )
+    with st.expander("Help me choose"):
+        st.markdown(
+            "- Docling is a little slower but more accurate, especially if the documents contain tables and images.\n- Simple will extract any content from the document without formatting it. It's ok for simple documents, but will lose quality in complex ones.\n- Auto (recommended) will try to process through docling and default to simple."
+        )
+
+
+with st.container(border=True):
+    st.markdown("**Content Processing Engine for URLs**")
+    # Test truthiness, not "is not None": .env.example ships these keys as
+    # empty assignments, which would otherwise count as configured.
+    firecrawl_enabled = bool(os.getenv("FIRECRAWL_API_KEY"))
+    jina_enabled = bool(os.getenv("JINA_API_KEY"))
+
+    url_engines = ["auto", "firecrawl", "jina", "simple"]
+    url_saved = content_settings.default_content_processing_engine_url
+    default_content_processing_engine_url = st.selectbox(
+        "Default Content Processing Engine for URLs",
+        url_engines,
+        index=(
+            url_engines.index(url_saved) if url_saved in url_engines else 0
+        ),
+    )
+    if not firecrawl_enabled and default_content_processing_engine_url in [
+        "firecrawl",
+        "auto",
+    ]:
+        st.warning(
+            "Firecrawl API Key missing. You need to add FIRECRAWL_API_KEY to use it. Get a key at [Firecrawl](https://firecrawl.dev/). If you don't add one, it will default to Jina."
+        )
+    if not jina_enabled and default_content_processing_engine_url in [
+        "jina",
+        "auto",
+    ]:
+        st.warning(
+            "Jina API Key missing. It will work for a few requests a day, but fallback to simple afterwards. Please add JINA_API_KEY to prevent that. Get a key at [Jina.ai](https://jina.ai/)."
+        )
+    with st.expander("Help me choose"):
+        st.markdown(
+            "- Firecrawl is a paid service (with a free tier), and very powerful.\n- Jina is a good option as well and also has a free tier.\n- Simple will use basic HTTP extraction and will miss content on javascript-based websites.\n- Auto (recommended) will try to use firecrawl (if API Key is present). Then, it will use Jina until reaches the limit (or will keep using Jina if you setup the API Key). It will fallback to simple, when none of the previous options is possible."
+        )
+
+with st.container(border=True):
+    st.markdown("**Content Embedding for Vector Search**")
+
+    embedding_options = ["ask", "always", "never"]
+    default_embedding_option = st.selectbox(
+        "Default Embedding Option for Vector Search",
+        embedding_options,
+        index=(
+            embedding_options.index(content_settings.default_embedding_option)
+            if content_settings.default_embedding_option in embedding_options
+            else 0  # unset or unrecognized stored value -> "ask"
+        ),
+    )
+    with st.expander("Help me choose"):
+        st.markdown(
+            "Embedding the content will make it easier to find by you and by your AI agents. If you are running a local embedding model (Ollama, for example), you shouldn't worry about cost and just embed everything. For online providers, you might want to be careful only if you process a lot of content (like 100s of documents a day)."
+        )
+        st.markdown(
+            "\n\n- Choose **always** if you are running a local embedding model or if your content volume is not that big\n- Choose **ask** if you want to decide every time\n- Choose **never** if you don't care about vector search or do not have an embedding provider."
+        )
+        st.markdown(
+            "As a reference, OpenAI's text-embedding-3-small costs about 0.02 for 1 million tokens -- which is about 30 times the [Wikipedia page for Earth](https://en.wikipedia.org/wiki/Earth). With Gemini API, Text Embedding 004 is free with a rate limit of 1500 requests per minute."
+        )
+
+with st.container(border=True):
+    st.markdown("**Auto Delete Uploaded Files**")
+    auto_delete_files = st.selectbox(
+        "Auto Delete Uploaded Files",
+        ["yes", "no"],
+        index=(
+            ["yes", "no"].index(content_settings.auto_delete_files)
+            if content_settings.auto_delete_files in ["yes", "no"]
+            else 0  # unset or unrecognized stored value -> "yes"
+        ),
+    )
+    with st.expander("Help me choose"):
+        st.markdown(
+            "Once your files are uploaded and processed, they are not required anymore. Most users should allow Open Notebook to delete uploaded files from the upload folder automatically. Choose **no**, ONLY if you are using Notebook as the primary storage location for those files (which you shouldn't be at all). This option will soon be deprecated in favor of always downloading the files."
+        )
+        st.markdown(
+            "\n\n- Choose **yes** (recommended) to have uploaded files removed from the upload folder once they have been processed\n- Choose **no** only if you need the original files to stay in the upload folder after processing."
+        )
+
+if st.button("Save", key="save_settings"):  # copy the four widget values onto content_settings, then call update()
+    content_settings.default_content_processing_engine_doc = (
+        default_content_processing_engine_doc
+    )
+    content_settings.default_content_processing_engine_url = (
+        default_content_processing_engine_url
+    )
+    content_settings.default_embedding_option = default_embedding_option
+    content_settings.auto_delete_files = auto_delete_files
+    content_settings.update()
+    st.toast("Settings saved successfully!")
--- a/pages/3_🔍_Ask_and_Search.py
+++ b/pages/3_🔍_Ask_and_Search.py
@ -1,5 +1,6 @@
 import asyncio

+import nest_asyncio
 import streamlit as st

 from open_notebook.domain.models import DefaultModels, model_manager
@ -8,6 +9,8 @@ from open_notebook.graphs.ask import graph as ask_graph
 from pages.components.model_selector import model_selector
 from pages.stream_app.utils import convert_source_references, setup_page

+nest_asyncio.apply()
+
 setup_page("🔍 Search")

 ask_tab, search_tab = st.tabs(["Ask Your Knowledge Base (beta)", "Search"])
--- a/pages/7_🤖_Models.py
+++ b/pages/7_🤖_Models.py
@ -39,7 +39,7 @@ provider_status["vertexai-anthropic"] = (
    and os.environ.get("VERTEX_LOCATION") is not None
    and os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None
 )
-provider_status["gemini"] = os.environ.get("GEMINI_API_KEY") is not None
+provider_status["gemini"] = os.environ.get("GOOGLE_API_KEY") is not None
 provider_status["openrouter"] = (
    os.environ.get("OPENROUTER_API_KEY") is not None
    and os.environ.get("OPENAI_API_KEY") is not None
--- a/pages/components/source_panel.py
+++ b/pages/components/source_panel.py
@ -1,7 +1,7 @@
 import asyncio

+import nest_asyncio
 import streamlit as st
-import streamlit_scrollable_textbox as stx  # type: ignore
 from humanize import naturaltime

 from open_notebook.domain.models import model_manager
@ -10,6 +10,8 @@ from open_notebook.domain.transformation import Transformation
 from open_notebook.graphs.transformation import graph as transform_graph
 from pages.stream_app.utils import check_models

+nest_asyncio.apply()
+

 def source_panel(source_id: str, notebook_id=None, modal=False):
    check_models(only_mandatory=False)
@ -100,4 +102,4 @@ def source_panel(source_id: str, notebook_id=None, modal=False):

    with source_tab:
        st.subheader("Content")
-        stx.scrollableTextbox(source.full_text, height=300)
+        st.markdown(source.full_text)
--- a/pages/stream_app/source.py
+++ b/pages/stream_app/source.py
@ -2,19 +2,22 @@ import asyncio
 import os
 from pathlib import Path

+import nest_asyncio
 import streamlit as st
 from humanize import naturaltime
 from loguru import logger

 from open_notebook.config import UPLOADS_FOLDER
+from open_notebook.domain.content_settings import ContentSettings
 from open_notebook.domain.models import model_manager
 from open_notebook.domain.notebook import Source
 from open_notebook.domain.transformation import Transformation
 from open_notebook.exceptions import UnsupportedTypeException
 from open_notebook.graphs.source import source_graph
 from pages.components import source_panel
+from pages.stream_app.consts import source_context_icons

-from .consts import source_context_icons
+nest_asyncio.apply()


@st.dialog("Source", width="large")
@ -31,6 +34,7 @@ def add_source(notebook_id):
    source_link = None
    source_file = None
    source_text = None
+    content_settings = ContentSettings()
    source_type = st.radio("Type", ["Link", "Upload", "Text"])
    req = {}
    transformations = Transformation.get_all()
@ -39,7 +43,7 @@ def add_source(notebook_id):
        req["url"] = source_link
    elif source_type == "Upload":
        source_file = st.file_uploader("Upload")
-        req["delete_source"] = st.checkbox("Delete source after processing", value=True)
+        req["delete_source"] = content_settings.auto_delete_files == "yes"

    else:
        source_text = st.text_area("Text")
@ -53,10 +57,22 @@ def add_source(notebook_id):
        format_func=lambda t: t.name,
        default=default_transformations,
    )
-    run_embed = st.checkbox(
-        "Embed content for vector search",
-        help="Creates an embedded content for vector search. Costs a little money and takes a little bit more time. You can do this later if you prefer.",
-    )
+    if content_settings.default_embedding_option == "ask":
+        run_embed = st.checkbox(
+            "Embed content for vector search",
+            help="Creates an embedded content for vector search. Costs a little money and takes a little bit more time. You can do this later if you prefer.",
+        )
+        if not run_embed:
+            st.caption("You can always embed later by clicking on the source.")
+    elif content_settings.default_embedding_option == "always":
+        st.caption("Embedding content for vector search automatically")
+        run_embed = True
+    else:
+        st.caption(
+            "Not embedding content for vector search as per settings. You can always embed later by clicking on the source."
+        )
+        run_embed = False
+
    if st.button("Process", key="add_source"):
        logger.debug("Adding source")
        with st.status("Processing...", expanded=True):
--- a/pages/stream_app/utils.py
+++ b/pages/stream_app/utils.py
@ -98,15 +98,19 @@ def setup_stream_state(current_notebook: Notebook) -> ChatSession:


 def check_migration():
-    logger.critical("Running migration check")
-    mm = MigrationManager()
-    if mm.needs_migration:
-        st.warning("The Open Notebook database needs a migration to run properly.")
-        if st.button("Run Migration"):
-            mm.run_migration_up()
-            st.success("Migration successful")
-            st.rerun()
-        st.stop()
+    # Keep checking until a run confirms no migration is pending, so the button-click rerun is handled.
+    if st.session_state.get("migration_required") is not False:
+        logger.critical("Running migration check")
+        mm = MigrationManager()
+        if mm.needs_migration:
+            st.warning("The Open Notebook database needs a migration to run properly.")
+            if st.button("Run Migration"):
+                mm.run_migration_up()
+                st.success("Migration successful")
+                st.rerun()
+            st.stop()
+        else:
+            st.session_state["migration_required"] = False


 def check_models(only_mandatory=True, stop_on_error=True):
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "open-notebook"
-version = "0.1.2"
+version = "0.2.0"
 description = "An open source implementation of a research assistant, inspired by Google Notebook LM"
 authors = [
    {name = "Luis Novo", email = "lfnovo@gmail.com"}
@ -14,7 +14,6 @@ classifiers = [
 requires-python = ">=3.11,<3.13"
 dependencies = [
    "streamlit>=1.39.0",
-    "watchdog>=5.0.3",
    "pydantic>=2.9.2",
    "loguru>=0.7.2",
    "langchain>=0.3.3",
@ -25,31 +24,23 @@ dependencies = [
    "tiktoken>=0.8.0",
    "streamlit-monaco>=0.1.3",
    "langgraph-checkpoint-sqlite>=2.0.0",
-    "pymupdf==1.24.11",
-    "python-magic>=0.4.27",
-    "langdetect>=1.0.9",
-    "youtube-transcript-api>=0.6.2",
    "openai>=1.52.0",
-    "pre-commit>=4.0.1",
    "langchain-community>=0.3.3",
-    "litellm>=1.50.1",
    "langchain-openai>=0.2.3",
    "langchain-anthropic>=0.2.3",
    "langchain-ollama>=0.2.0",
    "langchain-google-vertexai>=2.0.5",
    "langchain-google-genai>=2.0.1",
-    "podcastfy>=0.4",
    "tomli>=2.0.2",
-    "bs4>=0.0.2",
-    "python-docx>=1.1.2",
-    "python-pptx>=1.0.2",
-    "openpyxl>=3.1.5",
    "google-generativeai>=0.8.3",
    "langchain-groq>=0.2.1",
    "groq>=0.12.0",
    "python-dotenv>=1.0.1",
    "httpx[socks]>=0.27.0",
    "sdblpy",
+    "podcastfy",
+    "nest-asyncio>=1.6.0",
+    "content-core>=1.0.0",
 ]

 [tool.setuptools]
@ -63,12 +54,19 @@ dev = [
    "mypy>=1.11.1",
    "types-requests>=2.32.0.20241016",
    "ipywidgets>=8.1.5",
+    "pre-commit>=4.0.1",
 ]

 [build-system]
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"

+[dependency-groups]
+dev = [
+    "pre-commit>=4.1.0",  # NOTE(review): the other `dev` list in this file adds "pre-commit>=4.0.1"; confirm both declarations are intended
+    "watchdog>=6.0.0",
+]
+
 [tool.isort]
 profile = "black"
 line_length = 88
@ -82,3 +80,4 @@ ignore = ["E501"]

 [tool.uv.sources]
 sdblpy = { git = "https://github.com/lfnovo/surreal-lite-py" }
+podcastfy = { git = "https://github.com/lfnovo/podcastfy" }
--- a/uv.lock
+++ b/uv.lock