Merge pull request #19 from lfnovo/better_search

Better search, citations, new model providers
This commit is contained in:
Luis Novo 2024-11-13 22:11:03 -03:00 committed by GitHub
commit d7a8bbf435
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
68 changed files with 2462 additions and 1459 deletions

View file

@ -13,3 +13,6 @@ docker-compose*
docs/
surreal-data/
temp/
*.env
.mypy_cache/
.ruff_cache/

View file

@ -7,6 +7,7 @@ OPENAI_API_KEY=
# ANTHROPIC_API_KEY=
# GEMINI
# this is the best model for long context and podcast generation
# GEMINI_API_KEY=
# VERTEXAI
@ -20,6 +21,12 @@ OPENAI_API_KEY=
# OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
# OPENROUTER_API_KEY=
# GROQ
# GROQ_API_KEY=
# XAI
# XAI_API_KEY=
# ELEVENLABS
# Used only by the podcast feature
ELEVENLABS_API_KEY=

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
*.env
prompts/patterns/user/
notebooks/
data/

View file

@ -14,9 +14,10 @@ RUN pip install poetry --no-cache-dir
RUN poetry self add poetry-plugin-dotenv
RUN poetry config virtualenvs.create false
COPY . /app
COPY pyproject.toml poetry.lock /app/
RUN poetry install --only main
COPY . /app
EXPOSE 8502
RUN mkdir -p /app/data

36
Dockerfile_full Normal file
View file

@ -0,0 +1,36 @@
# Use an official Python runtime as a base image
FROM python:3.11.7-slim-bullseye
# Install system dependencies required for building certain Python packages
RUN apt-get update && apt-get install -y \
gcc \
curl wget libmagic-dev ffmpeg supervisor \
&& rm -rf /var/lib/apt/lists/*
# Install SurrealDB
RUN curl --proto '=https' --tlsv1.2 -sSf https://install.surrealdb.com | sh
# Set the working directory in the container to /app
WORKDIR /app
COPY pyproject.toml poetry.lock /app/
RUN pip install poetry --no-cache-dir
RUN poetry self add poetry-plugin-dotenv
RUN poetry config virtualenvs.create false
RUN poetry install --only main
COPY . /app
# Create supervisor configuration directory
RUN mkdir -p /etc/supervisor/conf.d
# Copy supervisor configuration file
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 8502
RUN mkdir -p /app/data
# Use supervisor as the main process
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View file

@ -52,6 +52,8 @@ docker-update-latest: docker-buildx-prepare
# Release with latest
docker-release-all: docker-release docker-update-latest
dev:
docker compose -f docker-compose.dev.yml up --build
docker compose -f docker-compose.dev.yml up --build
full:
docker compose -f docker-compose.full.yml up --build

View file

@ -35,6 +35,19 @@
</div>
## 📢 Open Notebook is under very active development
> Open Notebook is under active development! We're moving fast and making improvements every week. Your feedback is incredibly valuable to me during this exciting phase and it gives me motivation to keep improving and building this amazing tool. Please feel free to star the project if you find it useful, and don't hesitate to reach out with any questions or suggestions. I'm excited to see how you'll use it and what ideas you'll bring to the project! Let's build something amazing together! 🚀
>
> ⚠️ **API Changes**: As we optimize and enhance the project, some APIs and interfaces might change. We'll do our best to document these changes and minimize disruption.
>
> 🙏 **We Need Your Feedback**: Please try out Open Notebook and let us know what you think! Submit issues, feature requests, or just share your experience through:
> - GitHub Issues
> - Discussions
> - Pull Requests
>
> Together, we can make it even better!
<!-- TABLE OF CONTENTS -->
<details>
@ -153,6 +166,15 @@ Go to the [Usage](docs/USAGE.md) page to learn how to use all features.
## 🚀 New Features
### v0.1 - Release Candidate
- Better citations and improved search capabilities
- The "Ask" feature is much smarter now and let's you check its thinking
- Enabled support for X.AI and Groq models
- Select default transformations to apply to all content
- Save insights as custom notes
- Items are added to context by default
### v0.0.10 - Gemini podcast model
- Added the Gemini model for generating much more fluid and engaging podcasts

13
docker-compose.full.yml Normal file
View file

@ -0,0 +1,13 @@
version: '3'
services:
open_notebook_full:
build:
context: .
dockerfile: Dockerfile_full
ports:
- "8080:8502"
volumes:
- ./.docker_data/data:/app/data
- ./docker2.env:/app/.env
- ./google-credentials.json:/app/google-credentials.json

View file

@ -89,10 +89,10 @@ Go to the settings page and create your different models.
| Model Type | Supported Providers |
|------------|-----------|
| Language | OpenAI, Anthropic, Open Router, LiteLLM, Vertex AI, Vertex AI, Anthropic, Gemini, Ollama |
| Language | OpenAI, Anthropic, Open Router, LiteLLM, Vertex AI, Vertex AI, Anthropic, Gemini, Ollama, xAI, Groq |
| Embedding | OpenAI, Gemini, Vertex AI, Ollama |
| Speech to Text | OpenAI |
| Text to Speech | OpenAI, ElevenLabs |
| Speech to Text | OpenAI, Groq |
| Text to Speech | OpenAI, ElevenLabs, Gemini |
> 📝 **Notice:** For complete usage of all the features, you need to setup at least 4 models (one of each type).

134
migrations/4.surrealql Normal file
View file

@ -0,0 +1,134 @@
REMOVE FUNCTION IF EXISTS fn::text_search;
DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) {
let $source_title_search =
IF $sources {(
SELECT id, title,
search::highlight('`', '`', 1) as content,
id as parent_id,
math::max(search::score(1)) AS relevance
FROM source
WHERE title @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_embedding_search =
IF $sources {(
SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance
FROM source_embedding
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_full_search =
IF $sources {(
SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM source
WHERE full_text @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_insight_search =
IF $sources {(
SELECT id, insight_type + " - " + (source.title OR '') as title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM source_insight
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $note_title_search =
IF $show_notes {(
SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM note
WHERE title @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $note_content_search =
IF $show_notes {(
SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM note
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_chunk_results = array::union($source_embedding_search, $source_full_search);
let $source_asset_results = array::union($source_title_search, $source_insight_search);
let $source_results = array::union($source_chunk_results, $source_asset_results );
let $note_results = array::union($note_title_search, $note_content_search );
let $final_results = array::union($source_results, $note_results );
RETURN (select id, parent_id, title, math::max(relevance) as relevance
from $final_results where id is not None
group by id, parent_id, title ORDER BY relevance DESC LIMIT $match_count);
};
REMOVE FUNCTION IF EXISTS fn::vector_search;
DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array<float>, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) {
let $source_embedding_search =
IF $sources {(
SELECT
source.id as id,
source.title as title,
content,
source.id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM source_embedding
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $source_insight_search =
IF $sources {(
SELECT
id,
insight_type + ' - ' + (source.title OR '') as title,
content,
source.id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM source_insight
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $note_content_search =
IF $show_notes {(
SELECT
id,
title,
content,
id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM note
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $all_results = array::union(
array::union($source_embedding_search, $source_insight_search),
$note_content_search
);
RETURN (select id, parent_id, title, math::max(similarity) as similarity,
array::flatten(content) as matches
from $all_results where id is not None
group by id, parent_id, title ORDER BY similarity DESC LIMIT $match_count);
};

139
migrations/4_down.surrealql Normal file
View file

@ -0,0 +1,139 @@
REMOVE FUNCTION IF EXISTS fn::vector_search;
DEFINE FUNCTION IF NOT EXISTS fn::vector_search($query: array<float>, $match_count: int, $sources: bool, $show_notes: bool, $min_similarity: float) {
let $source_embedding_search =
IF $sources {(
SELECT
id,
source.title as title,
content,
source.id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM source_embedding
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $source_insight_search =
IF $sources {(
SELECT
id,
insight_type + ' - ' + source.title as title,
content,
source.id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM source_insight
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $note_content_search =
IF $show_notes {(
SELECT
id,
title,
content,
id as parent_id,
vector::similarity::cosine(embedding, $query) as similarity
FROM note
WHERE vector::similarity::cosine(embedding, $query) >= $min_similarity
ORDER BY similarity DESC
LIMIT $match_count
)}
ELSE { [] };
let $all_results = array::union(
array::union($source_embedding_search, $source_insight_search),
$note_content_search
);
RETURN (
SELECT
id, title, content, parent_id,
math::max(similarity) as similarity
FROM $all_results
GROUP BY id
ORDER BY similarity DESC
LIMIT $match_count
);
};
REMOVE FUNCTION IF EXISTS fn::text_search;
DEFINE FUNCTION IF NOT EXISTS fn::text_search($query_text: string, $match_count: int, $sources:bool, $show_notes:bool) {
let $source_title_search =
IF $sources {(
SELECT id, title,
search::highlight('`', '`', 1) as content,
id as parent_id,
math::max(search::score(1)) AS relevance
FROM source
WHERE title @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_embedding_search =
IF $sources {(
SELECT id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance
FROM source_embedding
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_full_search =
IF $sources {(
SELECT source.id as id, source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance
FROM source
WHERE full_text @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_insight_search =
IF $sources {(
SELECT id, insight_type + " - " + source.title as title, search::highlight('`', '`', 1) as content, source.id as parent_id, math::max(search::score(1)) AS relevance
FROM source_insight
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $note_title_search =
IF $show_notes {(
SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM note
WHERE title @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $note_content_search =
IF $show_notes {(
SELECT id, title, search::highlight('`', '`', 1) as content, id as parent_id, math::max(search::score(1)) AS relevance
FROM note
WHERE content @1@ $query_text
GROUP BY id)}
ELSE { [] };
let $source_chunk_results = array::union($source_embedding_search, $source_full_search);
let $source_asset_results = array::union($source_title_search, $source_insight_search);
let $source_results = array::union($source_chunk_results, $source_asset_results );
let $note_results = array::union($note_title_search, $note_content_search );
let $final_results = array::union($source_results, $note_results );
RETURN (SELECT id, title, content, parent_id, math::max(relevance) as relevance from $final_results
where id is not None
group by id, title, content, parent_id ORDER BY relevance DESC LIMIT $match_count);
};

View file

@ -27,7 +27,3 @@ LANGGRAPH_CHECKPOINT_FILE = f"{sqlite_folder}/checkpoints.sqlite"
# UPLOADS FOLDER
UPLOADS_FOLDER = f"{DATA_FOLDER}/uploads"
os.makedirs(UPLOADS_FOLDER, exist_ok=True)
# PODCASTS FOLDER
PODCASTS_FOLDER = f"{DATA_FOLDER}/podcasts"
os.makedirs(PODCASTS_FOLDER, exist_ok=True)

View file

@ -22,6 +22,7 @@ class MigrationManager:
Migration.from_file("migrations/1.surrealql"),
Migration.from_file("migrations/2.surrealql"),
Migration.from_file("migrations/3.surrealql"),
Migration.from_file("migrations/4.surrealql"),
]
self.down_migrations = [
Migration.from_file(
@ -29,6 +30,7 @@ class MigrationManager:
),
Migration.from_file("migrations/2_down.surrealql"),
Migration.from_file("migrations/3_down.surrealql"),
Migration.from_file("migrations/4_down.surrealql"),
]
self.runner = MigrationRunner(
up_migrations=self.up_migrations,
@ -53,14 +55,14 @@ class MigrationManager:
def run_migration_up(self):
current_version = self.get_current_version()
logger.debug(f"Current version before migration: {current_version}")
logger.info(f"Current version before migration: {current_version}")
if self.needs_migration:
try:
self.runner.run()
new_version = self.get_current_version()
logger.debug(f"Migration successful. New version: {new_version}")
logger.info(f"Migration successful. New version: {new_version}")
except Exception as e:
logger.error(f"Migration failed: {str(e)}")
else:
logger.debug("Database is already at the latest version")
logger.info("Database is already at the latest version")

View file

@ -30,7 +30,7 @@ def repo_query(query_str: str, vars: Optional[Dict[str, Any]] = None):
result = connection.query(query_str, vars)
return result
except Exception as e:
logger.critical(f"Query: {query_str}, Variables: {vars}")
logger.critical(f"Query: {query_str}")
logger.exception(e)
raise
@ -62,7 +62,5 @@ def repo_relate(source: str, relationship: str, target: str):
# "target": target,
# # "content": {}, # You can add properties to the relation here if needed
# }
logger.debug(f"Executing RELATE query: {query}")
result = repo_query(query)
logger.debug(f"RELATE query result: {result}")
return result

View file

@ -112,8 +112,6 @@ class ObjectModel(BaseModel):
from open_notebook.domain.models import model_manager
from open_notebook.models import EmbeddingModel
EMBEDDING_MODEL: EmbeddingModel = model_manager.embedding_model
try:
self.model_validate(self.model_dump(), strict=True)
data = self._prepare_save_data()
@ -122,11 +120,11 @@ class ObjectModel(BaseModel):
if self.needs_embedding():
embedding_content = self.get_embedding_content()
if embedding_content:
EMBEDDING_MODEL: EmbeddingModel = model_manager.embedding_model
data["embedding"] = EMBEDDING_MODEL.embed(embedding_content)
if self.id is None:
data["created"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.debug("Creating new record")
repo_result = repo_create(self.__class__.table_name, data)
else:
data["created"] = (
@ -204,9 +202,13 @@ class RecordModel(BaseModel):
result = repo_query(f"SELECT * FROM {self.record_id};")
if result:
result = result[0]
for key, value in result.items():
if hasattr(self, key):
setattr(self, key, value)
else:
repo_create(self.record_id, {})
result = {}
for key, value in result.items():
if hasattr(self, key):
setattr(self, key, value)
return self
def update(self, data):

View file

@ -68,7 +68,9 @@ class ModelManager:
)
return cached_model
assert model_id, "Model ID cannot be empty"
if not model_id:
return None
model: Model = Model.get(model_id)
if not model:
@ -160,9 +162,6 @@ class ModelManager:
elif model_type == "large_context":
model_id = self.defaults.large_context_model
if not model_id:
raise ValueError(f"No default model configured for type: {model_type}")
return self.get_model(model_id, **kwargs)
def clear_cache(self):

View file

@ -1,4 +1,5 @@
from typing import Any, ClassVar, Dict, List, Literal, Optional
from concurrent.futures import ThreadPoolExecutor
from typing import Any, ClassVar, Dict, List, Literal, Optional, Tuple
from loguru import logger
from pydantic import BaseModel, Field, field_validator
@ -129,6 +130,16 @@ class SourceInsight(ObjectModel):
logger.exception(e)
raise DatabaseOperationError(e)
def save_as_note(self, notebook_id: str = None) -> Any:
note = Note(
title=f"{self.insight_type} from source {self.source.title}",
content=self.content,
)
note.save()
if notebook_id:
note.add_to_notebook(notebook_id)
return note
class Source(ObjectModel):
table_name: ClassVar[str] = "source"
@ -140,15 +151,16 @@ class Source(ObjectModel):
def get_context(
self, context_size: Literal["short", "long"] = "short"
) -> Dict[str, Any]:
insights = [insight.model_dump() for insight in self.insights]
if context_size == "long":
return dict(
id=self.id,
title=self.title,
insights=[insight.model_dump() for insight in self.insights],
insights=insights,
full_text=self.full_text,
)
else:
return dict(id=self.id, title=self.title, insights=self.insights)
return dict(id=self.id, title=self.title, insights=insights)
@property
def embedded_chunks(self) -> int:
@ -186,54 +198,67 @@ class Source(ObjectModel):
return self.relate("reference", notebook_id)
def vectorize(self) -> None:
logger.info(f"Starting vectorization for source {self.id}")
EMBEDDING_MODEL = model_manager.embedding_model
try:
if not self.full_text:
logger.warning(f"No text to vectorize for source {self.id}")
return
chunks = split_text(
self.full_text,
)
logger.debug(f"Split into {len(chunks)} chunks")
chunk_count = len(chunks)
logger.info(f"Split into {chunk_count} chunks for source {self.id}")
# future: we can increase the batch size after surreal launches their new SDK
for i, chunk in enumerate(chunks):
if chunk_count == 0:
logger.warning("No chunks created after splitting")
return
def process_chunk(args: Tuple[int, str]) -> Tuple[int, List[float], str]:
idx, chunk = args
logger.debug(f"Processing chunk {idx}/{chunk_count}")
try:
embedding = EMBEDDING_MODEL.embed(chunk)
cleaned_content = surreal_clean(chunk)
logger.debug(f"Successfully processed chunk {idx}")
return (idx, embedding, cleaned_content)
except Exception as e:
logger.error(f"Error processing chunk {idx}: {str(e)}")
raise
# Process chunks in parallel while preserving order
logger.info("Starting parallel processing of chunks")
with ThreadPoolExecutor(max_workers=8) as executor:
# Create list of (index, chunk) tuples
chunk_tasks = list(enumerate(chunks))
# Process all chunks in parallel and get results
results = list(executor.map(process_chunk, chunk_tasks))
logger.info(f"Parallel processing complete. Got {len(results)} results")
# Insert results in order (they're already ordered by index)
for idx, embedding, content in results:
logger.debug(f"Inserting chunk {idx} into database")
repo_query(
f"""
CREATE source_embedding CONTENT {{
"source": {self.id},
"order": {i},
"order": {idx},
"content": $content,
"embedding": {EMBEDDING_MODEL.embed(chunk)},
"embedding": {embedding},
}};""",
{"content": surreal_clean(chunk)},
{"content": content},
)
logger.info(f"Vectorization complete for source {self.id}")
except Exception as e:
logger.error(f"Error vectorizing source {self.id}: {str(e)}")
logger.exception(e)
raise DatabaseOperationError(e)
# @classmethod
# def search(cls, query: str) -> List[Dict[str, Any]]:
# if not query:
# raise InvalidInputError("Search query cannot be empty")
# try:
# result = repo_query(
# """
# SELECT * omit full_text
# FROM source
# WHERE string::lowercase(title) CONTAINS $query or title @@ $query
# OR string::lowercase(summary) CONTAINS $query or summary @@ $query
# OR string::lowercase(full_text) CONTAINS $query or full_text @@ $query
# """,
# {"query": query},
# )
# return result
# except Exception as e:
# logger.error(f"Error searching sources: {str(e)}")
# logger.exception(e)
# raise DatabaseOperationError("Failed to search sources")
def add_insight(self, insight_type: str, content: str) -> Any:
EMBEDDING_MODEL = model_manager.embedding_model
@ -309,7 +334,8 @@ def text_search(keyword: str, results: int, source: bool = True, note: bool = Tr
try:
results = repo_query(
"""
SELECT * FROM fn::text_search($keyword, $results, $source, $note);
select *
from fn::text_search($keyword, $results, $source, $note)
""",
{"keyword": keyword, "results": results, "source": source, "note": note},
)
@ -320,7 +346,13 @@ def text_search(keyword: str, results: int, source: bool = True, note: bool = Tr
raise DatabaseOperationError(e)
def vector_search(keyword: str, results: int, source: bool = True, note: bool = True):
def vector_search(
keyword: str,
results: int,
source: bool = True,
note: bool = True,
minimum_score=0.2,
):
if not keyword:
raise InvalidInputError("Search keyword cannot be empty")
try:
@ -328,131 +360,18 @@ def vector_search(keyword: str, results: int, source: bool = True, note: bool =
embed = EMBEDDING_MODEL.embed(keyword)
results = repo_query(
"""
SELECT * FROM fn::vector_search($embed, $results, $source, $note, 0.15);
SELECT * FROM fn::vector_search($embed, $results, $source, $note, $minimum_score);
""",
{"embed": embed, "results": results, "source": source, "note": note},
{
"embed": embed,
"results": results,
"source": source,
"note": note,
"minimum_score": minimum_score,
},
)
return results
except Exception as e:
logger.error(f"Error performing vector search: {str(e)}")
logger.exception(e)
raise DatabaseOperationError(e)
def hybrid_search(
keyword_search: List[str],
embed_search: List[str],
results: int = 50,
source: bool = True,
note: bool = True,
max_chunks_per_doc: int = 3,
min_results_per_query: int = 3,
) -> Dict[str, List[Dict]]:
if not keyword_search and not embed_search:
raise InvalidInputError("At least one search term required")
# Process keyword searches
all_keyword_results = {} # Dictionary to store results per keyword
for keyword in keyword_search:
try:
search_results = text_search(keyword, results, source, note)
# Sort results by relevance
sorted_results = sorted(
search_results, key=lambda x: x.get("relevance", 0), reverse=True
)
# Group by parent_id and limit chunks per document
seen_parent_ids = {}
filtered_results = []
for result in sorted_results:
parent_id = result["parent_id"]
if parent_id not in seen_parent_ids:
seen_parent_ids[parent_id] = 1
filtered_results.append(result)
elif seen_parent_ids[parent_id] < max_chunks_per_doc:
seen_parent_ids[parent_id] += 1
filtered_results.append(result)
all_keyword_results[keyword] = filtered_results
except Exception as e:
logger.warning(f"Error in keyword search for term '{keyword}': {str(e)}")
continue
# Ensure minimum results from each keyword query
keyword_results = []
remaining_slots = results
# First pass: add minimum results from each query
for keyword, query_results in all_keyword_results.items():
keyword_results.extend(query_results[:min_results_per_query])
remaining_slots -= min(len(query_results), min_results_per_query)
# Second pass: fill remaining slots with best results
all_remaining = []
for keyword, query_results in all_keyword_results.items():
all_remaining.extend(query_results[min_results_per_query:])
# Sort remaining by relevance and add until we hit the limit
all_remaining = sorted(
all_remaining, key=lambda x: x.get("relevance", 0), reverse=True
)
seen_ids = {r["id"] for r in keyword_results}
for result in all_remaining:
if remaining_slots <= 0:
break
if result["id"] not in seen_ids:
keyword_results.append(result)
seen_ids.add(result["id"])
remaining_slots -= 1
# Process vector searches with the same approach
all_vector_results = {} # Dictionary to store results per embedding
for embed in embed_search:
try:
search_results = vector_search(embed, results, source, note)
# Sort results by similarity
sorted_results = sorted(
search_results, key=lambda x: x.get("similarity", 0), reverse=True
)
# Group by parent_id and limit chunks per document
seen_parent_ids = {}
filtered_results = []
for result in sorted_results:
parent_id = result["parent_id"]
if parent_id not in seen_parent_ids:
seen_parent_ids[parent_id] = 1
filtered_results.append(result)
elif seen_parent_ids[parent_id] < max_chunks_per_doc:
seen_parent_ids[parent_id] += 1
filtered_results.append(result)
all_vector_results[embed] = filtered_results
except Exception as e:
logger.warning(f"Error in vector search for term '{embed}': {str(e)}")
continue
# Ensure minimum results from each vector query
vector_results = []
remaining_slots = results
# First pass: add minimum results from each query
for embed, query_results in all_vector_results.items():
vector_results.extend(query_results[:min_results_per_query])
remaining_slots -= min(len(query_results), min_results_per_query)
# Second pass: fill remaining slots with best results
all_remaining = []
for embed, query_results in all_vector_results.items():
all_remaining.extend(query_results[min_results_per_query:])
# Sort remaining by similarity and add until we hit the limit
all_remaining = sorted(
all_remaining, key=lambda x: x.get("similarity", 0), reverse=True
)
seen_ids = {r["id"] for r in vector_results}
for result in all_remaining:
if remaining_slots <= 0:
break
if result["id"] not in seen_ids:
vector_results.append(result)
seen_ids.add(result["id"])
remaining_slots -= 1
return {"keyword_results": keyword_results, "vector_results": vector_results}

View file

@ -0,0 +1,19 @@
from typing import ClassVar, List, Optional
import yaml
from pydantic import Field
from open_notebook.domain.base import RecordModel
class Transformation:
@classmethod
def get_all(cls):
with open("transformations.yaml", "r") as file:
transformations = yaml.safe_load(file)
return transformations
class DefaultTransformations(RecordModel):
record_id: ClassVar[str] = "open_notebook:default_transformations"
source_insights: Optional[List[str]] = Field(default_factory=list)

126
open_notebook/graphs/ask.py Normal file
View file

@ -0,0 +1,126 @@
import operator
from typing import Annotated, List
from langchain_core.output_parsers.pydantic import PydanticOutputParser
from langchain_core.runnables import (
RunnableConfig,
)
from langgraph.graph import END, START, StateGraph
from langgraph.types import Send
from pydantic import BaseModel, Field
from typing_extensions import TypedDict
from open_notebook.domain.notebook import vector_search
from open_notebook.graphs.utils import provision_langchain_model
from open_notebook.prompter import Prompter
class SubGraphState(TypedDict):
question: str
term: str
# type: Literal["text", "vector"]
instructions: str
results: dict
answer: str
class Search(BaseModel):
term: str
# type: Literal["text", "vector"] = Field(
# description="The type of search. Use 'text' for keyword search and 'vector' for semantic search. If you are using text, search always for a single word"
# )
instructions: str = Field(
description="Tell the answeting LLM what information you need extracted from this search"
)
class Strategy(BaseModel):
reasoning: str
searches: List[Search] = Field(
default_factory=list,
description="You can add up to five searches to this strategy",
)
class ThreadState(TypedDict):
question: str
strategy: Strategy
answers: Annotated[list, operator.add]
final_answer: str
async def call_model_with_messages(state: ThreadState, config: RunnableConfig) -> dict:
parser = PydanticOutputParser(pydantic_object=Strategy)
system_prompt = Prompter(prompt_template="ask/entry", parser=parser).render(
data=state
)
model = provision_langchain_model(
system_prompt,
config.get("configurable", {}).get("strategy_model"),
"tools",
max_tokens=2000,
)
# model = model.bind_tools(tools)
ai_message = (model | parser).invoke(system_prompt)
return {"strategy": ai_message}
async def trigger_queries(state: ThreadState, config: RunnableConfig):
return [
Send(
"provide_answer",
{
"question": state["question"],
"instructions": s.instructions,
"term": s.term,
# "type": s.type,
},
)
for s in state["strategy"].searches
]
async def provide_answer(state: SubGraphState, config: RunnableConfig) -> dict:
payload = state
# if state["type"] == "text":
# results = text_search(state["term"], 10, True, True)
# else:
results = vector_search(state["term"], 10, True, True)
if len(results) == 0:
return {"answers": []}
payload["results"] = results
ids = [r["id"] for r in results]
payload["ids"] = ids
system_prompt = Prompter(prompt_template="ask/query_process").render(data=payload)
model = provision_langchain_model(
system_prompt,
config.get("configurable", {}).get("answer_model"),
"tools",
max_tokens=2000,
)
ai_message = model.invoke(system_prompt)
return {"answers": [ai_message.content]}
async def write_final_answer(state: ThreadState, config: RunnableConfig) -> dict:
system_prompt = Prompter(prompt_template="ask/final_answer").render(data=state)
model = provision_langchain_model(
system_prompt,
config.get("configurable", {}).get("final_answer_model"),
"tools",
max_tokens=2000,
)
ai_message = model.invoke(system_prompt)
return {"final_answer": ai_message.content}
agent_state = StateGraph(ThreadState)
agent_state.add_node("agent", call_model_with_messages)
agent_state.add_node("provide_answer", provide_answer)
agent_state.add_node("write_final_answer", write_final_answer)
agent_state.add_edge(START, "agent")
agent_state.add_conditional_edges("agent", trigger_queries, ["provide_answer"])
agent_state.add_edge("provide_answer", "write_final_answer")
agent_state.add_edge("write_final_answer", END)
graph = agent_state.compile()

View file

@ -1,6 +1,7 @@
import sqlite3
from typing import Annotated, Optional
from langchain_core.messages import SystemMessage
from langchain_core.runnables import (
RunnableConfig,
)
@ -24,8 +25,13 @@ class ThreadState(TypedDict):
def call_model_with_messages(state: ThreadState, config: RunnableConfig) -> dict:
system_prompt = Prompter(prompt_template="chat").render(data=state)
payload = [system_prompt] + state.get("messages", [])
model = provision_langchain_model(str(payload), config, "chat", max_tokens=2000)
payload = [SystemMessage(content=system_prompt)] + state.get("messages", [])
model = provision_langchain_model(
str(payload),
config.get("configurable", {}).get("model_id"),
"chat",
max_tokens=2000,
)
ai_message = model.invoke(payload)
return {"messages": ai_message}

View file

@ -1,4 +1,5 @@
import os
from typing import Any, Dict
import magic
from langgraph.graph import END, START, StateGraph
@ -14,14 +15,14 @@ from open_notebook.graphs.content_processing.pdf import (
SUPPORTED_FITZ_TYPES,
extract_pdf,
)
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
from open_notebook.graphs.content_processing.text import extract_txt
from open_notebook.graphs.content_processing.url import extract_url, url_provider
from open_notebook.graphs.content_processing.video import extract_best_audio_from_video
from open_notebook.graphs.content_processing.youtube import extract_youtube_transcript
def source_identification(state: SourceState):
async def source_identification(state: ContentState) -> Dict[str, str]:
"""
Identify the content source based on parameters
"""
@ -37,7 +38,7 @@ def source_identification(state: SourceState):
return {"source_type": doc_type}
def file_type(state: SourceState):
async def file_type(state: ContentState) -> Dict[str, Any]:
"""
Identify the file using python-magic
"""
@ -45,10 +46,11 @@ def file_type(state: SourceState):
file_path = state.get("file_path")
if file_path is not None:
return_dict["identified_type"] = magic.from_file(file_path, mime=True)
return_dict["title"] = os.path.basename(file_path)
return return_dict
def file_type_edge(data: SourceState):
async def file_type_edge(data: ContentState) -> str:
assert data.get("identified_type"), "Type not identified"
identified_type = data["identified_type"]
@ -68,7 +70,7 @@ def file_type_edge(data: SourceState):
)
def delete_file(data: SourceState):
async def delete_file(data: ContentState) -> Dict[str, Any]:
if data.get("delete_source"):
logger.debug(f"Deleting file: {data.get('file_path')}")
file_path = data.get("file_path")
@ -80,9 +82,21 @@ def delete_file(data: SourceState):
logger.warning(f"File not found while trying to delete: {file_path}")
else:
logger.debug("Not deleting file")
return {}
workflow = StateGraph(SourceState)
async def url_type_router(x: ContentState) -> str:
return x.get("identified_type", "")
async def source_type_router(x: ContentState) -> str:
return x.get("source_type", "")
# Create workflow
workflow = StateGraph(ContentState)
# Add nodes
workflow.add_node("source", source_identification)
workflow.add_node("url_provider", url_provider)
workflow.add_node("file_type", file_type)
@ -94,10 +108,12 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
workflow.add_node("extract_audio", extract_audio)
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
workflow.add_node("delete_file", delete_file)
# Add edges
workflow.add_edge(START, "source")
workflow.add_conditional_edges(
"source",
lambda x: x.get("source_type"),
source_type_router,
{
"url": "url_provider",
"file": "file_type",
@ -110,7 +126,7 @@ workflow.add_conditional_edges(
)
workflow.add_conditional_edges(
"url_provider",
lambda x: x.get("identified_type"),
url_type_router,
{"article": "extract_url", "youtube": "extract_youtube_transcript"},
)
workflow.add_edge("url_provider", END)
@ -124,4 +140,6 @@ workflow.add_edge("extract_office_content", "delete_file")
workflow.add_edge("extract_best_audio_from_video", "extract_audio")
workflow.add_edge("extract_audio", "delete_file")
workflow.add_edge("delete_file", END)
# Compile graph
graph = workflow.compile()

View file

@ -1,100 +1,114 @@
import asyncio
import os
from functools import partial
from math import ceil
from loguru import logger
from pydub import AudioSegment
from open_notebook.domain.models import model_manager
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
# todo: remove reference to model_manager
# future: parallelize the transcription process
def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
"""
Split an audio file into segments of specified length.
Args:
input_file (str): Path to the input audio file
segment_length_minutes (int): Length of each segment in minutes
output_dir (str): Directory to save the segments (defaults to input file's directory)
output_prefix (str): Prefix for output files (defaults to input filename)
Returns:
list: List of paths to the created segment files
Split an audio file into segments asynchronously.
"""
# Convert input file to absolute path
input_file = os.path.abspath(input_file)
output_dir = os.path.dirname(input_file)
os.makedirs(output_dir, exist_ok=True)
def _split(input_file, segment_length_minutes, output_prefix):
# Convert input file to absolute path
input_file_abs = os.path.abspath(input_file)
output_dir = os.path.dirname(input_file_abs)
os.makedirs(output_dir, exist_ok=True)
# Set up output prefix
if output_prefix is None:
output_prefix = os.path.splitext(os.path.basename(input_file))[0]
# Set up output prefix
if output_prefix is None:
output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
# Load the audio file
audio = AudioSegment.from_file(input_file)
# Load the audio file
audio = AudioSegment.from_file(input_file_abs)
# Calculate segment length in milliseconds
segment_length_ms = segment_length_minutes * 60 * 1000
# Calculate segment length in milliseconds
segment_length_ms = segment_length_minutes * 60 * 1000
# Calculate number of segments
total_segments = ceil(len(audio) / segment_length_ms)
logger.debug(f"Splitting file: {input_file} into {total_segments} segments")
# Calculate number of segments
total_segments = ceil(len(audio) / segment_length_ms)
logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
# List to store output file paths
output_files = []
output_files = []
# Split the audio into segments
for i in range(total_segments):
# Calculate start and end times for this segment
start_time = i * segment_length_ms
end_time = min((i + 1) * segment_length_ms, len(audio))
# Split the audio into segments
for i in range(total_segments):
start_time = i * segment_length_ms
end_time = min((i + 1) * segment_length_ms, len(audio))
# Extract segment
segment = audio[start_time:end_time]
# Extract segment
segment = audio[start_time:end_time]
# Generate output filename
# Format: prefix_001.mp3 (padding with zeros ensures correct ordering)
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
output_path = os.path.join(output_dir, output_filename)
# Generate output filename
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
output_path = os.path.join(output_dir, output_filename)
# Export segment
segment.export(output_path, format="mp3")
# Export segment
segment.export(output_path, format="mp3")
output_files.append(output_path)
output_files.append(output_path)
logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
# Optional progress indication
logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
return output_files
return output_files
# Run CPU-bound audio processing in thread pool
return await asyncio.get_event_loop().run_in_executor(
None, partial(_split, input_file, segment_length_minutes, output_prefix)
)
def extract_audio(data: SourceState):
async def transcribe_audio_segment(audio_file, model):
"""Transcribe a single audio segment asynchronously"""
def _transcribe(audio_file, model):
return model.transcribe(audio_file)
return await asyncio.get_event_loop().run_in_executor(
None, partial(_transcribe, audio_file, model)
)
async def extract_audio(data: ContentState):
SPEECH_TO_TEXT_MODEL = model_manager.speech_to_text
input_audio_path = data.get("file_path")
audio_files = []
try:
audio_files = split_audio(input_audio_path)
transcriptions = []
# Split audio into segments
audio_files = await split_audio(input_audio_path)
for audio_file in audio_files:
transcriptions.append(SPEECH_TO_TEXT_MODEL.transcribe(audio_file))
# Transcribe all segments concurrently
transcribe_tasks = [
transcribe_audio_segment(audio_file, SPEECH_TO_TEXT_MODEL)
for audio_file in audio_files
]
transcriptions = await asyncio.gather(*transcribe_tasks)
return {"content": " ".join(transcriptions)}
except Exception as e:
logger.error(f"Error transcribing audio: {str(e)}")
logger.exception(e)
raise # Re-raise the exception after logging
raise
finally:
for file in audio_files:
try:
os.remove(file)
except OSError as e:
logger.error(f"Error removing temporary file {file}: {str(e)}")
# Clean up temporary files
def _cleanup(files):
for file in files:
try:
os.remove(file)
except OSError as e:
logger.error(f"Error removing temporary file {file}: {str(e)}")
await asyncio.get_event_loop().run_in_executor(
None, partial(_cleanup, audio_files)
)

View file

@ -1,9 +1,12 @@
import asyncio
from functools import partial
from docx import Document
from loguru import logger
from openpyxl import load_workbook
from pptx import Presentation
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
SUPPORTED_OFFICE_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -12,252 +15,284 @@ SUPPORTED_OFFICE_TYPES = [
]
def extract_docx_content_detailed(file_path):
try:
doc = Document(file_path)
content = []
async def extract_docx_content_detailed(file_path):
"""Extract content from DOCX file"""
for paragraph in doc.paragraphs:
if not paragraph.text.strip():
continue
def _extract():
try:
doc = Document(file_path)
content = []
style = paragraph.style.name if paragraph.style else "Normal"
text = paragraph.text.strip()
for paragraph in doc.paragraphs:
if not paragraph.text.strip():
continue
# Get paragraph formatting
p_format = paragraph.paragraph_format
indent = p_format.left_indent or 0
style = paragraph.style.name if paragraph.style else "Normal"
text = paragraph.text.strip()
# Convert indent to spaces (1 level = 4 spaces)
indent_level = 0
if hasattr(indent, "pt"):
indent_level = int(indent.pt / 72) # 72 points = 1 inch
indent_spaces = " " * (indent_level * 4)
# Get paragraph formatting
p_format = paragraph.paragraph_format
indent = p_format.left_indent or 0
# Handle different types of formatting
if "Heading" in style:
level = style[-1] if style[-1].isdigit() else "1"
heading_marks = "#" * int(level)
content.append(f"\n{heading_marks} {text}\n")
# Convert indent to spaces (1 level = 4 spaces)
indent_level = 0
if hasattr(indent, "pt"):
indent_level = int(indent.pt / 72) # 72 points = 1 inch
indent_spaces = " " * (indent_level * 4)
# Handle bullet points
elif (
paragraph.style
and hasattr(paragraph.style, "name")
and paragraph.style.name.startswith("List")
):
# Numbered list
if (
hasattr(paragraph._p, "pPr")
and paragraph._p.pPr is not None
and hasattr(paragraph._p.pPr, "numPr")
and paragraph._p.pPr.numPr is not None
# Handle different types of formatting
if "Heading" in style:
level = style[-1] if style[-1].isdigit() else "1"
heading_marks = "#" * int(level)
content.append(f"\n{heading_marks} {text}\n")
# Handle bullet points
elif (
paragraph.style
and hasattr(paragraph.style, "name")
and paragraph.style.name.startswith("List")
):
# Try to get the actual number
try:
if (
hasattr(paragraph._p.pPr.numPr, "numId")
and paragraph._p.pPr.numPr.numId is not None
and hasattr(paragraph._p.pPr.numPr.numId, "val")
):
number = paragraph._p.pPr.numPr.numId.val
content.append(f"{indent_spaces}{number}. {text}")
else:
# Numbered list
if (
hasattr(paragraph._p, "pPr")
and paragraph._p.pPr is not None
and hasattr(paragraph._p.pPr, "numPr")
and paragraph._p.pPr.numPr is not None
):
# Try to get the actual number
try:
if (
hasattr(paragraph._p.pPr.numPr, "numId")
and paragraph._p.pPr.numPr.numId is not None
and hasattr(paragraph._p.pPr.numPr.numId, "val")
):
number = paragraph._p.pPr.numPr.numId.val
content.append(f"{indent_spaces}{number}. {text}")
else:
content.append(f"{indent_spaces}1. {text}")
except Exception:
content.append(f"{indent_spaces}1. {text}")
except Exception:
content.append(f"{indent_spaces}1. {text}")
# Bullet list
else:
content.append(f"{indent_spaces}* {text}")
else:
# Handle text formatting
formatted_text = []
for run in paragraph.runs:
if run.bold:
formatted_text.append(f"**{run.text}**")
elif run.italic:
formatted_text.append(f"*{run.text}*")
# Bullet list
else:
formatted_text.append(run.text)
content.append(f"{indent_spaces}* {text}")
content.append(f"{indent_spaces}{''.join(formatted_text)}")
else:
# Handle text formatting
formatted_text = []
for run in paragraph.runs:
if run.bold:
formatted_text.append(f"**{run.text}**")
elif run.italic:
formatted_text.append(f"*{run.text}*")
else:
formatted_text.append(run.text)
return "\n\n".join(content)
content.append(f"{indent_spaces}{''.join(formatted_text)}")
except Exception as e:
logger.error(f"Failed to extract DOCX content: {e}")
return None
return "\n\n".join(content)
except Exception as e:
logger.error(f"Failed to extract DOCX content: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, _extract)
# Example of usage with metadata
def get_docx_info(file_path):
try:
doc = Document(file_path)
async def get_docx_info(file_path):
"""Get DOCX metadata and content"""
# Extract core properties if available
core_props = {
"author": doc.core_properties.author,
"created": doc.core_properties.created,
"modified": doc.core_properties.modified,
"title": doc.core_properties.title,
"subject": doc.core_properties.subject,
"keywords": doc.core_properties.keywords,
"category": doc.core_properties.category,
"comments": doc.core_properties.comments,
}
def _get_info():
try:
doc = Document(file_path)
# Get document content
content = extract_docx_content_detailed(file_path)
# Extract core properties if available
core_props = {
"author": doc.core_properties.author,
"created": doc.core_properties.created,
"modified": doc.core_properties.modified,
"title": doc.core_properties.title,
"subject": doc.core_properties.subject,
"keywords": doc.core_properties.keywords,
"category": doc.core_properties.category,
"comments": doc.core_properties.comments,
}
# Get document statistics
stats = {
"paragraph_count": len(doc.paragraphs),
"word_count": sum(
len(p.text.split()) for p in doc.paragraphs if p.text.strip()
),
"character_count": sum(
len(p.text) for p in doc.paragraphs if p.text.strip()
),
}
# Get document content
content = extract_docx_content_detailed(file_path)
return {"metadata": core_props, "content": content, "statistics": stats}
# Get document statistics
stats = {
"paragraph_count": len(doc.paragraphs),
"word_count": sum(
len(p.text.split()) for p in doc.paragraphs if p.text.strip()
),
"character_count": sum(
len(p.text) for p in doc.paragraphs if p.text.strip()
),
}
except Exception as e:
logger.error(f"Failed to get DOCX info: {e}")
return None
return {"metadata": core_props, "content": content, "statistics": stats}
except Exception as e:
logger.error(f"Failed to get DOCX info: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, _get_info)
def extract_pptx_content(file_path):
try:
prs = Presentation(file_path)
content = []
async def extract_pptx_content(file_path):
"""Extract content from PPTX file"""
for slide_number, slide in enumerate(prs.slides, 1):
content.append(f"\n# Slide {slide_number}\n")
def _extract():
try:
prs = Presentation(file_path)
content = []
# Extract title
if slide.shapes.title:
content.append(f"## {slide.shapes.title.text}\n")
for slide_number, slide in enumerate(prs.slides, 1):
content.append(f"\n# Slide {slide_number}\n")
# Extract text from all shapes
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
if shape != slide.shapes.title: # Skip title as it's already added
content.append(shape.text.strip())
# Extract title
if slide.shapes.title:
content.append(f"## {slide.shapes.title.text}\n")
return "\n\n".join(content)
# Extract text from all shapes
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
if (
shape != slide.shapes.title
): # Skip title as it's already added
content.append(shape.text.strip())
except Exception as e:
logger.error(f"Failed to extract PPTX content: {e}")
return None
return "\n\n".join(content)
except Exception as e:
logger.error(f"Failed to extract PPTX content: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, _extract)
def extract_xlsx_content(file_path, max_rows=1000, max_cols=100):
try:
wb = load_workbook(file_path, data_only=True)
content = []
async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
"""Extract content from XLSX file"""
for sheet in wb.sheetnames:
ws = wb[sheet]
content.append(f"\n# Sheet: {sheet}\n")
def _extract():
try:
wb = load_workbook(file_path, data_only=True)
content = []
# Get the maximum row and column with data
max_row = min(ws.max_row, max_rows)
max_col = min(ws.max_column, max_cols)
for sheet in wb.sheetnames:
ws = wb[sheet]
content.append(f"\n# Sheet: {sheet}\n")
# Create markdown table header
headers = []
for col in range(1, max_col + 1):
cell_value = ws.cell(row=1, column=col).value
headers.append(str(cell_value) if cell_value is not None else "")
# Get the maximum row and column with data
max_row = min(ws.max_row, max_rows)
max_col = min(ws.max_column, max_cols)
content.append("| " + " | ".join(headers) + " |")
content.append("| " + " | ".join(["---"] * len(headers)) + " |")
# Add table content
for row in range(2, max_row + 1):
row_data = []
# Create markdown table header
headers = []
for col in range(1, max_col + 1):
cell_value = ws.cell(row=row, column=col).value
row_data.append(str(cell_value) if cell_value is not None else "")
content.append("| " + " | ".join(row_data) + " |")
cell_value = ws.cell(row=1, column=col).value
headers.append(str(cell_value) if cell_value is not None else "")
return "\n".join(content)
content.append("| " + " | ".join(headers) + " |")
content.append("| " + " | ".join(["---"] * len(headers)) + " |")
except Exception as e:
logger.error(f"Failed to extract XLSX content: {e}")
return None
# Add table content
for row in range(2, max_row + 1):
row_data = []
for col in range(1, max_col + 1):
cell_value = ws.cell(row=row, column=col).value
row_data.append(
str(cell_value) if cell_value is not None else ""
)
content.append("| " + " | ".join(row_data) + " |")
return "\n".join(content)
except Exception as e:
logger.error(f"Failed to extract XLSX content: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, partial(_extract))
def get_pptx_info(file_path):
try:
prs = Presentation(file_path)
async def get_pptx_info(file_path):
"""Get PPTX metadata and content"""
# Extract basic properties
props = {
"slide_count": len(prs.slides),
"title": "", # PowerPoint doesn't have built-in metadata like Word
}
def _get_info():
try:
prs = Presentation(file_path)
# Get document content
content = extract_pptx_content(file_path)
# Extract basic properties
props = {
"slide_count": len(prs.slides),
"title": "", # PowerPoint doesn't have built-in metadata like Word
}
# Get presentation statistics
stats = {
"slide_count": len(prs.slides),
"shape_count": sum(len(slide.shapes) for slide in prs.slides),
"text_frame_count": sum(
sum(1 for shape in slide.shapes if hasattr(shape, "text"))
for slide in prs.slides
),
}
# Get document content
content = extract_pptx_content(file_path)
return {"metadata": props, "content": content, "statistics": stats}
# Get presentation statistics
stats = {
"slide_count": len(prs.slides),
"shape_count": sum(len(slide.shapes) for slide in prs.slides),
"text_frame_count": sum(
sum(1 for shape in slide.shapes if hasattr(shape, "text"))
for slide in prs.slides
),
}
except Exception as e:
logger.error(f"Failed to get PPTX info: {e}")
return None
return {"metadata": props, "content": content, "statistics": stats}
except Exception as e:
logger.error(f"Failed to get PPTX info: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, _get_info)
def get_xlsx_info(file_path):
try:
wb = load_workbook(file_path, data_only=True)
async def get_xlsx_info(file_path):
"""Get XLSX metadata and content"""
# Extract basic properties
props = {
"sheet_count": len(wb.sheetnames),
"sheets": wb.sheetnames,
"title": wb.properties.title,
"creator": wb.properties.creator,
"created": wb.properties.created,
"modified": wb.properties.modified,
}
def _get_info():
try:
wb = load_workbook(file_path, data_only=True)
# Get document content
content = extract_xlsx_content(file_path)
# Extract basic properties
props = {
"sheet_count": len(wb.sheetnames),
"sheets": wb.sheetnames,
"title": wb.properties.title,
"creator": wb.properties.creator,
"created": wb.properties.created,
"modified": wb.properties.modified,
}
# Get workbook statistics
stats = {
"sheet_count": len(wb.sheetnames),
"total_rows": sum(sheet.max_row for sheet in wb.worksheets),
"total_columns": sum(sheet.max_column for sheet in wb.worksheets),
}
# Get document content
content = extract_xlsx_content(file_path)
return {"metadata": props, "content": content, "statistics": stats}
# Get workbook statistics
stats = {
"sheet_count": len(wb.sheetnames),
"total_rows": sum(sheet.max_row for sheet in wb.worksheets),
"total_columns": sum(sheet.max_column for sheet in wb.worksheets),
}
except Exception as e:
logger.error(f"Failed to get XLSX info: {e}")
return None
return {"metadata": props, "content": content, "statistics": stats}
except Exception as e:
logger.error(f"Failed to get XLSX info: {e}")
return None
return await asyncio.get_event_loop().run_in_executor(None, _get_info)
def extract_office_content(state: SourceState):
async def extract_office_content(state: ContentState):
"""Universal function to extract content from Office files"""
assert state.get("file_path"), "No file path provided"
assert (
state.get("identified_type") in SUPPORTED_OFFICE_TYPES
), "Unsupported File Type"
file_path = state["file_path"]
doc_type = state["identified_type"]
@ -266,24 +301,23 @@ def extract_office_content(state: SourceState):
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
logger.debug("Extracting content from DOCX file")
content = extract_docx_content_detailed(file_path)
info = get_docx_info(file_path)
content = await extract_docx_content_detailed(file_path)
info = await get_docx_info(file_path)
elif (
doc_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
logger.debug("Extracting content from PPTX file")
content = extract_pptx_content(file_path)
info = get_pptx_info(file_path)
content = await extract_pptx_content(file_path)
info = await get_pptx_info(file_path)
elif (
doc_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
logger.debug("Extracting content from XLSX file")
content = extract_xlsx_content(file_path)
info = get_xlsx_info(file_path)
content = await extract_xlsx_content(file_path)
info = await get_xlsx_info(file_path)
else:
raise Exception(f"Unsupported file format: {doc_type}")
del info["content"]
return {"content": content, "metadata": info}

View file

@ -1,10 +1,11 @@
import asyncio
import re
import unicodedata
import fitz # type: ignore
from loguru import logger
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
# todo: what else can we do to make the text more readable?
@ -114,7 +115,7 @@ def clean_pdf_text(text):
return text.strip()
def _extract_text_from_pdf(pdf_path):
async def _extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
try:
text = ""
@ -127,20 +128,39 @@ def _extract_text_from_pdf(pdf_path):
doc.close()
def extract_pdf(state: SourceState):
async def _extract_text_from_pdf(pdf_path):
"""Extract text from PDF asynchronously"""
def _extract():
doc = fitz.open(pdf_path)
try:
text = ""
logger.debug(f"Found {len(doc)} pages in PDF")
for page in doc:
text += page.get_text()
return clean_pdf_text(text)
finally:
doc.close()
# Run CPU-bound PDF processing in a thread pool
return await asyncio.get_event_loop().run_in_executor(None, _extract)
async def extract_pdf(state: ContentState):
"""
Parse the text file and print its content.
Parse the PDF file and extract its content asynchronously.
"""
return_dict = {}
assert state.get("file_path"), "No file path provided"
assert state.get("identified_type") in SUPPORTED_FITZ_TYPES, "Unsupported File Type"
if (
state.get("file_path") is not None
and state.get("identified_type") in SUPPORTED_FITZ_TYPES
):
file_path = state.get("file_path")
try:
text = _extract_text_from_pdf(file_path)
text = await _extract_text_from_pdf(file_path)
return_dict["content"] = text
except FileNotFoundError:
raise FileNotFoundError(f"File not found at {file_path}")

View file

@ -1,7 +1,7 @@
from typing_extensions import TypedDict
class SourceState(TypedDict):
class ContentState(TypedDict):
content: str
file_path: str
url: str

View file

@ -1,11 +1,13 @@
import asyncio
from loguru import logger
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
def extract_txt(state: SourceState):
async def extract_txt(state: ContentState):
"""
Parse the text file and print its content.
Parse the text file and extract its content asynchronously.
"""
return_dict = {}
if (
@ -14,12 +16,22 @@ def extract_txt(state: SourceState):
):
logger.debug(f"Extracting text from {state.get('file_path')}")
file_path = state.get("file_path")
if file_path is not None:
try:
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
logger.debug(f"Extracted: {content[:100]}")
return_dict["content"] = content
def _read_file():
with open(file_path, "r", encoding="utf-8") as file:
return file.read()
# Run file I/O in thread pool
content = await asyncio.get_event_loop().run_in_executor(
None, _read_file
)
logger.debug(f"Extracted: {content[:100]}")
return_dict["content"] = content
except FileNotFoundError:
raise FileNotFoundError(f"File not found at {file_path}")
except Exception as e:

View file

@ -1,18 +1,18 @@
import re
from urllib.parse import urlparse
import requests # type: ignore
import aiohttp
from bs4 import BeautifulSoup, Comment
from loguru import logger
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
# future: better extraction methods
# https://github.com/buriy/python-readability
# also try readability: from readability import Document
def url_provider(state: SourceState):
def url_provider(state: ContentState):
"""
Identify the provider
"""
@ -29,7 +29,7 @@ def url_provider(state: SourceState):
return return_dict
def extract_url_bs4(url: str):
async def extract_url_bs4(url: str):
"""
Get the title and content of a URL using bs4
"""
@ -42,9 +42,10 @@ def extract_url_bs4(url: str):
if url.startswith("<!DOCTYPE html>") or url.startswith("<html"):
html_content = url
else:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers, timeout=10) as response:
response.raise_for_status()
html_content = await response.text()
soup = BeautifulSoup(html_content, "html.parser")
@ -143,7 +144,7 @@ def extract_url_bs4(url: str):
"url": url if not url.startswith("<!DOCTYPE html>") else None,
}
except requests.exceptions.RequestException as e:
except aiohttp.ClientError as e:
logger.error(f"Failed to fetch URL {url}: {e}")
return None
except Exception as e:
@ -151,38 +152,38 @@ def extract_url_bs4(url: str):
return None
def extract_url_jina(url: str):
async def extract_url_jina(url: str):
"""
Get the content of a URL using Jina
"""
response = requests.get(f"https://r.jina.ai/{url}")
text = response.text
if text.startswith("Title:") and "\n" in text:
title_end = text.index("\n")
title = text[6:title_end].strip()
content = text[title_end + 1 :].strip()
logger.debug(
f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
)
return {"title": title, "content": content}
else:
content = text
logger.debug(
f"Processed url: {url}, does not have Title prefix, returning full content: {content[:100]}..."
)
return {"content": text}
async with aiohttp.ClientSession() as session:
async with session.get(f"https://r.jina.ai/{url}") as response:
text = await response.text()
if text.startswith("Title:") and "\n" in text:
title_end = text.index("\n")
title = text[6:title_end].strip()
content = text[title_end + 1 :].strip()
logger.debug(
f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
)
return {"title": title, "content": content}
else:
logger.debug(
f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
)
return {"content": text}
def extract_url(state: SourceState):
async def extract_url(state: ContentState):
assert state.get("url"), "No URL provided"
url = state["url"]
try:
result = extract_url_bs4(url)
result = await extract_url_bs4(url)
if not result or not result.get("content"):
logger.debug(
f"BS4 extraction failed for url {url}, falling back to Jina extractor"
)
result = extract_url_jina(url)
result = await extract_url_jina(url)
return result
except Exception as e:
logger.error(f"URL extraction failed for URL: {url}")

View file

@ -1,114 +1,141 @@
import asyncio
import json
import os
import subprocess
from functools import partial
from loguru import logger
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
def extract_audio_from_video(input_file, output_file, stream_index):
async def extract_audio_from_video(input_file, output_file, stream_index):
"""
Extract the specified audio stream to MP3 format
Extract the specified audio stream to MP3 format asynchronously
"""
try:
cmd = [
"ffmpeg",
"-i",
input_file,
"-map",
f"0:a:{stream_index}", # Select specific audio stream
"-codec:a",
"libmp3lame", # Use MP3 codec
"-q:a",
"2", # High quality setting
"-y", # Overwrite output file if exists
output_file,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"FFmpeg failed: {result.stderr}")
def _extract(input_file, output_file, stream_index):
try:
cmd = [
"ffmpeg",
"-i",
input_file,
"-map",
f"0:a:{stream_index}", # Select specific audio stream
"-codec:a",
"libmp3lame", # Use MP3 codec
"-q:a",
"2", # High quality setting
"-y", # Overwrite output file if exists
output_file,
]
return True
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"FFmpeg failed: {result.stderr}")
except Exception as e:
print(f"Error extracting audio: {str(e)}")
return False
return True
except Exception as e:
logger.error(f"Error extracting audio: {str(e)}")
return False
return await asyncio.get_event_loop().run_in_executor(
None, partial(_extract, input_file, output_file, stream_index)
)
def get_audio_streams(input_file):
async def get_audio_streams(input_file):
"""
Analyze video file and return information about all audio streams
Analyze video file and return information about all audio streams asynchronously
"""
logger.debug(f"Analyzing video file {input_file} for audio streams")
try:
# Get stream information in JSON format
cmd = [
"ffprobe",
"-v",
"quiet",
"-print_format",
"json",
"-show_streams",
"-select_streams",
"a",
input_file,
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"FFprobe failed: {result.stderr}")
def _analyze(input_file):
logger.debug(f"Analyzing video file {input_file} for audio streams")
try:
cmd = [
"ffprobe",
"-v",
"quiet",
"-print_format",
"json",
"-show_streams",
"-select_streams",
"a",
input_file,
]
data = json.loads(result.stdout)
return data.get("streams", [])
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"FFprobe failed: {result.stderr}")
except Exception as e:
print(f"Error analyzing file: {str(e)}")
return []
data = json.loads(result.stdout)
return data.get("streams", [])
except Exception as e:
logger.error(f"Error analyzing file: {str(e)}")
return []
return await asyncio.get_event_loop().run_in_executor(
None, partial(_analyze, input_file)
)
def select_best_audio_stream(streams):
async def select_best_audio_stream(streams):
"""
Select the best audio stream based on various quality metrics
"""
if not streams:
logger.debug("No audio streams found")
return None
else:
logger.debug(f"Found {len(streams)} audio streams")
# Score each stream based on various factors
scored_streams = []
for stream in streams:
score = 0
def _select(streams):
if not streams:
logger.debug("No audio streams found")
return None
else:
logger.debug(f"Found {len(streams)} audio streams")
# Prefer higher bit rates
bit_rate = stream.get("bit_rate")
if bit_rate:
score += int(int(bit_rate) / 1000000) # Convert to Mbps and ensure int
# Score each stream based on various factors
scored_streams = []
for stream in streams:
score = 0
# Prefer more channels (stereo over mono)
channels = stream.get("channels", 0)
score += channels * 10
# Prefer higher bit rates
bit_rate = stream.get("bit_rate")
if bit_rate:
score += int(int(bit_rate) / 1000000) # Convert to Mbps and ensure int
# Prefer higher sample rates
sample_rate = stream.get("sample_rate", "0")
score += int(int(sample_rate) / 48000)
# Prefer more channels (stereo over mono)
channels = stream.get("channels", 0)
score += channels * 10
scored_streams.append((score, stream))
# Prefer higher sample rates
sample_rate = stream.get("sample_rate", "0")
score += int(int(sample_rate) / 48000)
# Return the stream with highest score
return max(scored_streams, key=lambda x: x[0])[1]
scored_streams.append((score, stream))
# Return the stream with highest score
return max(scored_streams, key=lambda x: x[0])[1]
return await asyncio.get_event_loop().run_in_executor(
None, partial(_select, streams)
)
def extract_best_audio_from_video(data: SourceState):
async def extract_best_audio_from_video(data: ContentState):
"""
Main function to extract the best audio stream from a video file
Main function to extract the best audio stream from a video file asynchronously
"""
input_file = data.get("file_path")
assert input_file is not None, "Input file path must be provided"
if not os.path.exists(input_file):
def _check_file(path):
return os.path.exists(path)
file_exists = await asyncio.get_event_loop().run_in_executor(
None, partial(_check_file, input_file)
)
if not file_exists:
logger.critical(f"Input file not found: {input_file}")
return False
@ -116,20 +143,20 @@ def extract_best_audio_from_video(data: SourceState):
output_file = f"{base_name}_audio.mp3"
# Get all audio streams
streams = get_audio_streams(input_file)
streams = await get_audio_streams(input_file)
if not streams:
logger.debug("No audio streams found in the file")
return False
# Select best stream
best_stream = select_best_audio_stream(streams)
best_stream = await select_best_audio_stream(streams)
if not best_stream:
logger.error("Could not determine best audio stream")
return False
# Extract the selected stream
stream_index = streams.index(best_stream)
success = extract_audio_from_video(input_file, output_file, stream_index)
success = await extract_audio_from_video(input_file, output_file, stream_index)
if success:
logger.debug(f"Successfully extracted audio to: {output_file}")

View file

@ -1,7 +1,7 @@
import re
import ssl
import requests
import aiohttp
from bs4 import BeautifulSoup
from loguru import logger
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
@ -9,16 +9,20 @@ from youtube_transcript_api.formatters import TextFormatter # type: ignore
from open_notebook.config import CONFIG
from open_notebook.exceptions import NoTranscriptFound
from open_notebook.graphs.content_processing.state import SourceState
from open_notebook.graphs.content_processing.state import ContentState
ssl._create_default_https_context = ssl._create_unverified_context
def get_video_title(video_id):
async def get_video_title(video_id):
try:
url = f"https://www.youtube.com/watch?v={video_id}"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
html = await response.text()
# BeautifulSoup doesn't support async operations
soup = BeautifulSoup(html, "html.parser")
# YouTube stores title in a meta tag
title = soup.find("meta", property="og:title")["content"]
@ -63,7 +67,7 @@ def _extract_youtube_id(url):
return match.group(1) if match else None
def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
@ -129,7 +133,7 @@ def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
return None
def extract_youtube_transcript(state: SourceState):
async def extract_youtube_transcript(state: ContentState):
"""
Parse the text file and print its content.
"""
@ -139,12 +143,12 @@ def extract_youtube_transcript(state: SourceState):
)
video_id = _extract_youtube_id(state.get("url"))
transcript = get_best_transcript(video_id, languages)
transcript = await get_best_transcript(video_id, languages)
logger.debug(f"Found transcript: {transcript}")
formatter = TextFormatter()
try:
title = get_video_title(video_id)
title = await get_video_title(video_id)
except Exception as e:
logger.critical(f"Failed to get video title for video_id: {video_id}")
logger.exception(e)

View file

@ -1,44 +0,0 @@
from typing import Annotated
from langchain_core.runnables import (
RunnableConfig,
)
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from typing_extensions import TypedDict
from open_notebook.graphs.tools import repository_search
from open_notebook.graphs.utils import provision_langchain_model
from open_notebook.prompter import Prompter
tools = [repository_search]
tool_node = ToolNode(tools)
class ThreadState(TypedDict):
messages: Annotated[list, add_messages]
# notebook: Optional[Notebook]
# context: Optional[str]
# context_config: Optional[dict]
def call_model_with_messages(state: ThreadState, config: RunnableConfig) -> dict:
system_prompt = Prompter(prompt_template="rag").render(data=state)
payload = [system_prompt] + state.get("messages", [])
model = provision_langchain_model(str(payload), config, "tools", max_tokens=2000)
model = model.bind_tools(tools)
ai_message = model.invoke(payload)
return {"messages": ai_message}
agent_state = StateGraph(ThreadState)
agent_state.add_node("agent", call_model_with_messages)
agent_state.add_node("tools", tool_node)
agent_state.add_edge(START, "agent")
agent_state.add_conditional_edges(
"agent",
tools_condition,
)
agent_state.add_edge("tools", "agent")
graph = agent_state.compile()

View file

@ -0,0 +1,126 @@
import operator
from typing import List
from langchain_core.runnables import (
RunnableConfig,
)
from langgraph.graph import END, START, StateGraph
from langgraph.types import Send
from loguru import logger
from typing_extensions import Annotated, TypedDict
from open_notebook.domain.notebook import Asset, Source
from open_notebook.domain.transformation import Transformation
from open_notebook.graphs.content_processing import ContentState
from open_notebook.graphs.content_processing import graph as content_graph
from open_notebook.graphs.multipattern import graph as transform_graph
from open_notebook.utils import surreal_clean
class SourceState(TypedDict):
content_state: ContentState
transformations: List[str]
notebook_id: str
source: Source
transformations: Annotated[list, operator.add]
embed: bool
class TransformationState(TypedDict):
source: Source
transformation: dict
async def content_process(state: SourceState) -> dict:
content_state = state["content_state"]
logger.info("Content processing started for new content")
processed_state = await content_graph.ainvoke(content_state)
return {"content_state": processed_state}
async def run_patterns(input_text: str, patterns: List[dict]) -> str:
output = await transform_graph.ainvoke(
dict(content_stack=[input_text], patterns=patterns)
)
return output["output"]
def save_source(state: SourceState) -> dict:
content_state = state["content_state"]
source = Source(
asset=Asset(
url=content_state.get("url"), file_path=content_state.get("file_path")
),
full_text=surreal_clean(content_state["content"]),
title=content_state.get("title"),
)
source.save()
if state["notebook_id"]:
logger.debug(f"Adding source to notebook {state['notebook_id']}")
source.add_to_notebook(state["notebook_id"])
if state["embed"]:
logger.debug("Embedding content for vector search")
source.vectorize()
return {"source": source}
def trigger_transformations(state: SourceState, config: RunnableConfig) -> List[Send]:
if len(state["transformations"]) == 0:
return []
transformations = Transformation.get_all()
to_apply = [
t
for t in transformations["source_insights"]
if t["name"] in state["transformations"]
]
logger.debug(f"Applying transformations {to_apply}")
return [
Send(
"transform_content",
{
"source": state["source"],
"transformation": t,
},
)
for t in to_apply
]
async def transform_content(state: TransformationState) -> dict:
source = state["source"]
content = source.full_text
if not content:
return None
transformation = state["transformation"]
logger.debug(f"Applying transformation {transformation['name']}")
result = await run_patterns(content, patterns=transformation["patterns"])
source.add_insight(transformation["name"], surreal_clean(result))
return {"transformations": [{"name": transformation["name"], "content": result}]}
# Create and compile the workflow
workflow = StateGraph(SourceState)
# Add nodes
workflow.add_node("content_process", content_process)
workflow.add_node("save_source", save_source)
workflow.add_node("transform_content", transform_content)
# Define the graph edges
workflow.add_edge(START, "content_process")
workflow.add_edge("content_process", "save_source")
workflow.add_conditional_edges(
"save_source", trigger_transformations, ["transform_content"]
)
workflow.add_edge("transform_content", END)
# Compile the graph
source_graph = workflow.compile()

View file

@ -1,10 +1,7 @@
from datetime import datetime
from typing import List
from langchain.tools import tool
from open_notebook.domain.notebook import hybrid_search
# todo: turn this into a system prompt variable
@tool
@ -14,14 +11,3 @@ def get_current_timestamp() -> str:
Returns the current timestamp in the format YYYYMMDDHHmmss.
"""
return datetime.now().strftime("%Y%m%d%H%M%S")
@tool
def repository_search(keyword_searches: List[str], vector_searches: List[str]) -> str:
"""
name: repository_search
Makes a search in the content repository for the given query.
keyword_searches: List[str] - A list of search terms to search for using keyword search.
vector_searches: List[str] - A list of search terms to search for using vector search.
"""
return hybrid_search(keyword_searches, vector_searches, 20)

View file

@ -1,5 +1,5 @@
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from loguru import logger
from open_notebook.domain.models import model_manager
@ -8,7 +8,9 @@ from open_notebook.prompter import Prompter
from open_notebook.utils import token_count
def provision_langchain_model(content, config, default_type, **kwargs) -> BaseChatModel:
def provision_langchain_model(
content, model_id, default_type, **kwargs
) -> BaseChatModel:
"""
Returns the best model to use based on the context size and on whether there is a specific model being requested in Config.
If context > 105_000, returns the large_context_model
@ -22,10 +24,8 @@ def provision_langchain_model(content, config, default_type, **kwargs) -> BaseCh
f"Using large context model because the content has {tokens} tokens"
)
model = model_manager.get_default_model("large_context", **kwargs)
elif config.get("configurable", {}).get("model_id"):
model = model_manager.get_model(
config.get("configurable", {}).get("model_id"), **kwargs
)
elif model_id:
model = model_manager.get_model(model_id, **kwargs)
else:
model = model_manager.get_default_model(default_type, **kwargs)
@ -37,16 +37,18 @@ def provision_langchain_model(content, config, default_type, **kwargs) -> BaseCh
def run_pattern(
pattern_name: str,
config,
messages=[],
state: dict = {},
parser=None,
) -> BaseMessage:
system_prompt = Prompter(prompt_template=pattern_name, parser=parser).render(
data=state
)
payload = [system_prompt] + messages
chain = provision_langchain_model(str(payload), config, "transformation")
payload = [SystemMessage(content=system_prompt)] + [
HumanMessage(content=state["input_text"])
]
chain = provision_langchain_model(
str(payload), config.get("configurable", {}).get("model_id"), "transformation"
)
response = chain.invoke(payload)
return response

View file

@ -10,6 +10,7 @@ from open_notebook.models.embedding_models import (
from open_notebook.models.llms import (
AnthropicLanguageModel,
GeminiLanguageModel,
GroqLanguageModel,
LanguageModel,
LiteLLMLanguageModel,
OllamaLanguageModel,
@ -17,8 +18,10 @@ from open_notebook.models.llms import (
OpenRouterLanguageModel,
VertexAILanguageModel,
VertexAnthropicLanguageModel,
XAILanguageModel,
)
from open_notebook.models.speech_to_text_models import (
GroqSpeechToTextModel,
OpenAISpeechToTextModel,
SpeechToTextModel,
)
@ -44,6 +47,8 @@ MODEL_CLASS_MAP: Dict[str, ProviderMap] = {
"anthropic": AnthropicLanguageModel,
"openai": OpenAILanguageModel,
"gemini": GeminiLanguageModel,
"xai": XAILanguageModel,
"groq": GroqLanguageModel,
},
"embedding": {
"openai": OpenAIEmbeddingModel,
@ -53,6 +58,7 @@ MODEL_CLASS_MAP: Dict[str, ProviderMap] = {
},
"speech_to_text": {
"openai": OpenAISpeechToTextModel,
"groq": GroqSpeechToTextModel,
},
"text_to_speech": {
"openai": OpenAITextToSpeechModel,

View file

@ -13,6 +13,7 @@ from langchain_core.language_models.chat_models import BaseChatModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
from langchain_groq.chat_models import ChatGroq
from langchain_ollama.chat_models import ChatOllama
from langchain_openai.chat_models import ChatOpenAI
from pydantic import SecretStr
@ -171,7 +172,7 @@ class OpenRouterLanguageModel(LanguageModel):
def to_langchain(self) -> ChatOpenAI:
"""
Convert the language model to a LangChain chat model.
Convert the language model to a LangChain chat model for Open Router.
"""
kwargs = self.kwargs
if self.json:
@ -191,6 +192,58 @@ class OpenRouterLanguageModel(LanguageModel):
)
@dataclass
class GroqLanguageModel(LanguageModel):
"""
Language model that uses the Groq chat model.
"""
model_name: str
def to_langchain(self) -> ChatGroq:
"""
Convert the language model to a LangChain chat model for Groq.
"""
kwargs = self.kwargs
kwargs["top_p"] = self.top_p
return ChatGroq(
model=self.model_name,
temperature=self.temperature or 0.5,
max_tokens=self.max_tokens,
model_kwargs=kwargs,
stop_sequences=None,
)
@dataclass
class XAILanguageModel(LanguageModel):
"""
Language model that uses the OpenAI chat model for X.AI.
"""
model_name: str
def to_langchain(self) -> ChatOpenAI:
"""
Convert the language model to a LangChain chat model.
"""
kwargs = self.kwargs
if self.json:
kwargs["response_format"] = {"type": "json_object"}
return ChatOpenAI(
model=self.model_name,
temperature=self.temperature or 0.5,
base_url=os.environ.get("XAI_BASE_URL", "https://api.x.ai/v1"),
max_tokens=self.max_tokens,
model_kwargs=kwargs,
streaming=self.streaming,
api_key=SecretStr(os.environ.get("XAI_API_KEY", "xai")),
top_p=self.top_p,
)
@dataclass
class AnthropicLanguageModel(LanguageModel):
"""
@ -226,15 +279,27 @@ class OpenAILanguageModel(LanguageModel):
"""
Convert the language model to a LangChain chat model.
"""
kwargs = self.kwargs
kwargs = self.kwargs.copy() # Make a copy to avoid modifying the original
if self.json:
kwargs["response_format"] = {"type": "json_object"}
# Set the token limit in kwargs with the appropriate key
if self.model_name in ["o1-mini", "o1-preview"]:
kwargs["max_completion_tokens"] = self.max_tokens
top_p = 1
streaming = False
max_tokens = None
else:
max_tokens = self.max_tokens
top_p = self.top_p
streaming = self.streaming
return ChatOpenAI(
model=self.model_name,
temperature=self.temperature or 0.5,
max_tokens=self.max_tokens,
temperature=self.temperature,
streaming=streaming,
max_tokens=max_tokens,
top_p=top_p,
model_kwargs=kwargs,
streaming=self.streaming,
top_p=self.top_p,
)

View file

@ -40,3 +40,22 @@ class OpenAISpeechToTextModel(SpeechToTextModel):
model=self.model_name, file=audio
)
return transcription.text
@dataclass
class GroqSpeechToTextModel(SpeechToTextModel):
model_name: str
def transcribe(self, audio_file_path: str) -> str:
"""
Transcribes an audio file into text
"""
from groq import Groq
# todo: make this Singleton
client = Groq()
with open(audio_file_path, "rb") as audio:
transcription = client.audio.transcriptions.create(
model=self.model_name, file=audio
)
return transcription.text

View file

@ -2,8 +2,9 @@ from typing import ClassVar, List, Optional
from loguru import logger
from podcastfy.client import generate_podcast
from pydantic import Field, field_validator
from pydantic import Field, field_validator, model_validator
from open_notebook.config import DATA_FOLDER
from open_notebook.domain.notebook import ObjectModel
@ -22,26 +23,46 @@ class PodcastConfig(ObjectModel):
podcast_name: str
podcast_tagline: str
output_language: str = Field(default="English")
person1_role: str
person2_role: str
person1_role: List[str]
person2_role: List[str]
conversation_style: List[str]
engagement_technique: List[str]
dialogue_structure: List[str]
transcript_model: Optional[str] = None
transcript_model_provider: Optional[str] = None
user_instructions: Optional[str] = None
ending_message: Optional[str] = None
wordcount: int = Field(ge=400, le=10000)
creativity: float = Field(ge=0, le=1)
provider: str = Field(default="openai")
voice1: Optional[str] = None
voice2: Optional[str] = None
voice1: str
voice2: str
model: str
def generate_episode(self, episode_name, text, instructions=None):
# Backwards compatibility
@field_validator("person1_role", "person2_role", mode="before")
@classmethod
def split_string_to_list(cls, value):
if isinstance(value, str):
return [item.strip() for item in value.split(",")]
return value
@model_validator(mode="after")
def validate_voices(self) -> "PodcastConfig":
if not self.voice1 or not self.voice2:
raise ValueError("Both voice1 and voice2 must be provided")
return self
def generate_episode(
self,
episode_name: str,
text: str,
instructions: str = "",
longform: bool = False,
):
self.user_instructions = (
instructions if instructions else self.user_instructions
)
conversation_config = {
"word_count": self.wordcount,
"conversation_style": self.conversation_style,
"roles_person1": self.person1_role,
"roles_person2": self.person2_role,
@ -53,7 +74,11 @@ class PodcastConfig(ObjectModel):
"engagement_techniques": self.engagement_technique,
"creativity": self.creativity,
"text_to_speech": {
# "temp_audio_dir": f"{PODCASTS_FOLDER}/tmp",
"output_directories": {
"transcripts": f"{DATA_FOLDER}/podcasts/transcripts",
"audio": f"{DATA_FOLDER}/podcasts/audio",
},
"temp_audio_dir": f"{DATA_FOLDER}/podcasts/audio/tmp",
"ending_message": "Thank you for listening to this episode. Don't forget to subscribe to our podcast for more interesting conversations.",
"default_tts_model": self.provider,
self.provider: {
@ -67,12 +92,30 @@ class PodcastConfig(ObjectModel):
},
}
api_key_label = None
llm_model_name = None
if self.transcript_model_provider:
if self.transcript_model_provider == "openai":
api_key_label = "OPENAI_API_KEY"
llm_model_name = self.transcript_model
elif self.transcript_model_provider == "anthropic":
api_key_label = "ANTHROPIC_API_KEY"
llm_model_name = self.transcript_model
elif self.transcript_model_provider == "gemini":
api_key_label = "GEMINI_API_KEY"
llm_model_name = self.transcript_model
logger.debug(
f"Generating episode {episode_name} with config {conversation_config}"
f"Generating episode {episode_name} with config {conversation_config} and using model {llm_model_name}"
)
audio_file = generate_podcast(
conversation_config=conversation_config, text=text, tts_model=self.provider
conversation_config=conversation_config,
text=text,
tts_model=self.provider,
llm_model_name=llm_model_name,
api_key_label=api_key_label,
longform=longform,
)
episode = PodcastEpisode(
name=episode_name,
@ -92,12 +135,6 @@ class PodcastConfig(ObjectModel):
raise ValueError(f"{field.field_name} cannot be None or empty string")
return value.strip()
@field_validator("wordcount")
def validate_wordcount(cls, value):
if not 400 <= value <= 6000:
raise ValueError("Wordcount must be between 400 and 10000")
return value
@field_validator("creativity")
def validate_creativity(cls, value):
if not 0 <= value <= 1:
@ -116,13 +153,8 @@ conversation_styles = [
"Debate-style",
"Interview-style",
"Storytelling",
"Reflective",
"Narrative",
"Satirical",
"Educational",
"Conversational",
"Critical",
"Empathetic",
"Philosophical",
"Speculative",
"Motivational",
@ -132,25 +164,15 @@ conversation_styles = [
"Serious",
"Investigative",
"Debunking",
"Collaborative",
"Didactic",
"Thought-provoking",
"Controversial",
"Skeptical",
"Optimistic",
"Pessimistic",
"Objective",
"Subjective",
"Sarcastic",
"Emotional",
"Exploratory",
"Friendly",
"Fast-paced",
"Slow-paced",
"Introspective",
"Open-ended",
"Affirmative",
"Dissenting",
]
# Dialogue Structures
@ -167,15 +189,10 @@ dialogue_structures = [
"Pro Arguments",
"Con Arguments",
"Cross-examination",
"Rebuttal",
"Expert Interviews",
"Panel Discussion",
"Case Studies",
"Myth Busting",
"Debunking Misconceptions",
"Audience Questions",
"Q&A Session",
"Listener Feedback",
"Rapid-fire Questions",
"Summary of Key Points",
"Recap",
@ -183,29 +200,11 @@ dialogue_structures = [
"Actionable Tips",
"Call to Action",
"Future Outlook",
"Teaser for Next Episode",
"Closing Remarks",
"Thank You and Credits",
"Outtakes or Bloopers",
"Sponsor Messages",
"Social Media Shout-outs",
"Resource Recommendations",
"Feedback Request",
"Lightning Round",
"Behind-the-Scenes Insights",
"Ethical Considerations",
"Fact-checking Segment",
"Trending Topics",
"Closing Inspirational Quote",
"Final Reflections",
"Debrief",
"Farewell Messages",
"Next Episode Preview",
"Live Reactions",
"Call-in Segment",
"Acknowledgements",
"Transition Segments",
"Break Segments",
]
# Podcast Participant Roles
@ -241,15 +240,7 @@ participant_roles = [
"Researcher",
"Reporter",
"Advocate",
"Influencer",
"Observer",
"Listener",
"Facilitator",
"Innovator",
"Debater",
"Educator",
"Motivator",
"Narrator",
"Explorer",
"Opponent",
"Proponent",
@ -265,49 +256,17 @@ participant_roles = [
"Author",
"Journalist",
"Activist",
"Challenger",
"Supporter",
"Mentor",
"Mentee",
"Panelist",
"Audience Representative",
"Case Study Presenter",
"Data Analyst",
"Ethicist",
"Cultural Critic",
"Technologist",
"Environmentalist",
"Legal Expert",
"Healthcare Professional",
"Financial Advisor",
"Policy Maker",
"Sociologist",
"Anthropologist",
"Myth Buster",
"Trend Analyst",
"Futurist",
"Negotiator",
"Community Leader",
"Voice of Reason",
"Conflict Resolver",
"Emotional Support",
"Pragmatist",
"Idealist",
"Realist",
"Satirist",
"Story Analyst",
"Language Expert",
"Historical Witness",
"Survivor",
"Inspirational Figure",
"Cultural Ambassador",
"Digital Nomad",
"Remote Correspondent",
"Field Reporter",
"Data Scientist",
"Gamer",
"Musician",
"Filmmaker",
]
# Engagement Techniques

View file

@ -10,4 +10,28 @@ youtube_transcripts:
- fr
- de
- hi
- ja
- ja
suggested_models:
openai:
language:
- gpt-4o-mini
embedding:
- text-embedding-3-small
text_to_speech:
- tts-1-hd
speech_to_text:
- whisper-1
gemini:
language:
- gemini-1.5-flash
text_to_speech:
- default
xai:
language:
- grok-beta
anthropic:
language:
- claude-3-5-sonnet-20241022
elevenlabs:
text_to_speech:
- eleven_turbo_v2_5

View file

@ -1,8 +1,11 @@
import asyncio
import streamlit as st
from open_notebook.domain.models import Model
from open_notebook.domain.notebook import text_search, vector_search
from open_notebook.graphs.rag import graph as rag_graph
from open_notebook.domain.models import DefaultModels
from open_notebook.domain.notebook import Note, Notebook, text_search, vector_search
from open_notebook.graphs.ask import graph as ask_graph
from pages.components.model_selector import model_selector
from pages.stream_app.utils import convert_source_references, setup_page
setup_page("🔍 Search")
@ -12,13 +15,37 @@ ask_tab, search_tab = st.tabs(["Ask Your Knowledge Base (beta)", "Search"])
if "search_results" not in st.session_state:
st.session_state["search_results"] = []
if "ask_results" not in st.session_state:
st.session_state["ask_results"] = {}
async def process_ask_query(question, strategy_model, answer_model, final_answer_model):
async for chunk in ask_graph.astream(
input=dict(
question=question,
),
config=dict(
configurable=dict(
strategy_model=strategy_model.id,
answer_model=answer_model.id,
final_answer_model=final_answer_model.id,
)
),
stream_mode="updates",
):
yield (chunk)
def results_card(item):
score = item.get("relevance", item.get("similarity", item.get("score", 0)))
with st.expander(f"[{score:.2f}] **{item['title']}**"):
st.markdown(f"**{item['content']}**")
st.write(item["id"])
st.write(item["parent_id"])
with st.container(border=True):
st.markdown(
f"[{score:.2f}] **[{item['title']}](/?object_id={item['parent_id']})**"
)
if "matches" in item:
with st.expander("Matches"):
for match in item["matches"]:
st.markdown(match)
with ask_tab:
@ -26,24 +53,80 @@ with ask_tab:
st.caption(
"The LLM will answer your query based on the documents in your knowledge base. "
)
st.warning(
"This functionality requires the use of Tools and, at this moment, works well with Open AI and Anthropic models only."
)
question = st.text_input("Question", "")
models = Model.get_models_by_type("language")
model: Model = st.selectbox("Model", models, format_func=lambda x: x.name)
if st.button("Ask"):
st.write(f"Searching for {question}")
messages = [question]
rag_results = rag_graph.invoke(
dict(
messages=messages,
),
config=dict(configurable=dict(model_id=model.id)),
)
st.markdown(convert_source_references(rag_results["messages"][-1].content))
with st.expander("Details (for debugging)"):
st.json(rag_results)
default_model = DefaultModels().load().default_chat_model
strategy_model = model_selector(
"Query Strategy Model",
"strategy_model",
selected_id=default_model,
model_type="language",
help="This is the LLM that will be responsible for strategizing the search",
)
answer_model = model_selector(
"Individual Answer Model",
"answer_model",
model_type="language",
selected_id=default_model,
help="This is the LLM that will be responsible for processing individual subqueries",
)
final_answer_model = model_selector(
"Final Answer Model",
"final_answer_model",
model_type="language",
selected_id=default_model,
help="This is the LLM that will be responsible for processing the final answer",
)
ask_bt = st.button("Ask")
placeholder = st.container()
async def stream_results():
async for chunk in process_ask_query(
question, strategy_model, answer_model, final_answer_model
):
if "agent" in chunk:
with placeholder.expander(
f"Agent Strategy: {chunk['agent']['strategy'].reasoning}"
):
for search in chunk["agent"]["strategy"].searches:
st.markdown(f"Searched for: **{search.term}**")
st.markdown(f"Instructions: {search.instructions}")
elif "provide_answer" in chunk:
for answer in chunk["provide_answer"]["answers"]:
with placeholder.expander("Answer"):
st.markdown(convert_source_references(answer))
elif "write_final_answer" in chunk:
st.session_state["ask_results"]["answer"] = chunk["write_final_answer"][
"final_answer"
]
with placeholder.container(border=True):
st.markdown(
convert_source_references(
chunk["write_final_answer"]["final_answer"]
)
)
if ask_bt:
placeholder.write(f"Searching for {question}")
st.session_state["ask_results"]["question"] = question
st.session_state["ask_results"]["answer"] = None
asyncio.run(stream_results())
if st.session_state["ask_results"].get("answer"):
with st.container(border=True):
with st.form("save_note_form"):
notebook = st.selectbox(
"Notebook", Notebook.get_all(), format_func=lambda x: x.name
)
if st.form_submit_button("Save Answer as Note"):
note = Note(
title=st.session_state["ask_results"]["question"],
content=st.session_state["ask_results"]["answer"],
)
note.save()
note.add_to_notebook(notebook.id)
st.success("Note saved successfully")
with search_tab:
with st.container(border=True):

View file

@ -18,7 +18,6 @@ setup_page("🎙️ Podcasts")
text_to_speech_models = Model.get_models_by_type("text_to_speech")
provider_models: Dict[str, List[str]] = {}
for model in text_to_speech_models:
@ -26,11 +25,28 @@ for model in text_to_speech_models:
provider_models[model.provider] = []
provider_models[model.provider].append(model.name)
text_models = Model.get_models_by_type("language")
transcript_provider_models: Dict[str, List[str]] = {}
for model in text_models:
if model.provider not in ["gemini", "openai", "anthropic"]:
continue
if model.provider not in transcript_provider_models:
transcript_provider_models[model.provider] = []
transcript_provider_models[model.provider].append(model.name)
if len(text_to_speech_models) == 0:
st.error("No text to speech models found. Please set one up in the Settings page.")
st.stop()
if len(text_models) == 0:
st.error(
"No language models found. Please set one up in the Settings page. Only Gemini, Open AI and Anthropic models supported for transcript generation."
)
st.stop()
episodes_tab, templates_tab = st.tabs(["Episodes", "Templates"])
with episodes_tab:
@ -66,34 +82,56 @@ with templates_tab:
"User Instructions",
help="Any additional intructions to pass to the LLM that will generate the transcript",
)
pd_cfg["person1_role"] = st.text_input("Person 1 role")
pd_cfg["person1_role"] = st_tags(
[], participant_roles, "Person 1 roles", key="person1_roles"
)
st.caption(f"Suggestions:{', '.join(participant_roles)}")
pd_cfg["person2_role"] = st.text_input("Person 2 role")
pd_cfg["person2_role"] = st_tags(
[], participant_roles, "Person 2 roles", key="person2_roles"
)
pd_cfg["conversation_style"] = st_tags(
[], conversation_styles, "Conversation Style"
[], conversation_styles, "Conversation Style", key="conversation_styles"
)
st.caption(f"Suggestions:{', '.join(conversation_styles)}")
pd_cfg["engagement_technique"] = st_tags(
[], engagement_techniques, "Engagement Techniques"
[],
engagement_techniques,
"Engagement Techniques",
key="engagement_techniques",
)
st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
pd_cfg["dialogue_structure"] = st_tags(
[], dialogue_structures, "Dialogue Structure"
[], dialogue_structures, "Dialogue Structure", key="dialogue_structures"
)
st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
pd_cfg["wordcount"] = st.slider(
"Word Count", min_value=400, max_value=6000, step=50
)
pd_cfg["creativity"] = st.slider(
"Creativity", min_value=0.0, max_value=1.0, step=0.05
)
pd_cfg["ending_message"] = st.text_input(
"Ending Message", placeholder="Thank you for listening!"
)
pd_cfg["provider"] = st.selectbox("Provider", provider_models.keys())
pd_cfg["transcript_model_provider"] = st.selectbox(
"Transcript Model Provider", transcript_provider_models.keys()
)
pd_cfg["transcript_model"] = st.selectbox(
"Transcript Model",
transcript_provider_models[pd_cfg["transcript_model_provider"]],
)
pd_cfg["provider"] = st.selectbox(
"Audio Model Provider", provider_models.keys()
)
pd_cfg["model"] = st.selectbox(
"Audio Model", provider_models[pd_cfg["provider"]]
)
st.caption(
"OpenAI: tts-1 or tts-1-hd, Elevenlabs: eleven_multilingual_v2, eleven_turbo_v2_5"
)
pd_cfg["voice1"] = st.text_input(
"Voice 1", help="You can use Elevenlabs voice ID"
)
st.caption("Voice names are case sensitive. Be sure to add the exact name.")
st.markdown(
"[Open AI voices](https://platform.openai.com/docs/guides/text-to-speech)"
)
@ -105,19 +143,13 @@ with templates_tab:
"Voice 2", help="You can use Elevenlabs voice ID"
)
pd_cfg["model"] = st.selectbox("Model", provider_models[pd_cfg["provider"]])
st.caption(
"OpenAI: tts-1 or tts-1-hd, Elevenlabs: eleven_multilingual_v2, eleven_turbo_v2_5"
)
if st.button("Save"):
try:
pd = PodcastConfig(**pd_cfg)
pd_cfg = {}
pd.save()
st.rerun()
except Exception as e:
st.error(e)
st.exception(e)
for pd_config in PodcastConfig.get_all(order_by="created desc"):
with st.expander(pd_config.name):
@ -146,17 +178,20 @@ with templates_tab:
value=pd_config.output_language,
key=f"output_language_{pd_config.id}",
)
pd_config.person1_role = st.text_input(
"Person 1 role",
value=pd_config.person1_role,
key=f"person1_role_{pd_config.id}",
pd_config.person1_role = st_tags(
pd_config.person1_role,
conversation_styles,
"Person 1 Roles",
key=f"person_1_roles_{pd_config.id}",
)
st.caption(f"Suggestions:{', '.join(participant_roles)}")
pd_config.person2_role = st.text_input(
"Person 2 role",
value=pd_config.person2_role,
key=f"person2_role_{pd_config.id}",
pd_config.person2_role = st_tags(
pd_config.person2_role,
conversation_styles,
"Person 2 Roles",
key=f"person_2_roles_{pd_config.id}",
)
pd_config.conversation_style = st_tags(
pd_config.conversation_style,
conversation_styles,
@ -178,14 +213,6 @@ with templates_tab:
key=f"dialogue_structure_{pd_config.id}",
)
st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
pd_config.wordcount = st.slider(
"Word Count",
min_value=400,
max_value=6000,
step=50,
value=pd_config.wordcount,
key=f"wordcount_{pd_config.id}",
)
pd_config.creativity = st.slider(
"Creativity",
min_value=0.0,
@ -200,32 +227,44 @@ with templates_tab:
placeholder="Thank you for listening!",
key=f"ending_message_{pd_config.id}",
)
if pd_config.transcript_model_provider not in transcript_provider_models:
index = 0
else:
index = list(transcript_provider_models.keys()).index(
pd_config.transcript_model_provider
)
pd_config.transcript_model_provider = st.selectbox(
"Transcript Model Provider",
list(transcript_provider_models.keys()),
index=index,
key=f"transcript_provider_{pd_config.id}",
)
if (
not pd_config.transcript_model
or pd_config.transcript_model
not in transcript_provider_models[pd_config.transcript_model_provider]
):
index = 0
else:
index = transcript_provider_models[
pd_config.transcript_model_provider
].index(pd_config.transcript_model)
pd_config.transcript_model = st.selectbox(
"Transcript Model",
transcript_provider_models[pd_config.transcript_model_provider],
index=index,
key=f"transcript_model_{pd_config.id}",
)
pd_config.provider = st.selectbox(
"Provider",
"Audio Model Provider",
list(provider_models.keys()),
index=list(provider_models.keys()).index(pd_config.provider),
key=f"provider_{pd_config.id}",
)
pd_config.voice1 = st.text_input(
"Voice 1",
value=pd_config.voice1,
key=f"voice1_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
st.markdown(
"[Open AI voices](https://platform.openai.com/docs/guides/text-to-speech)"
)
st.markdown(
"[Gemini voices](https://cloud.google.com/text-to-speech/docs/voices)"
)
pd_config.voice2 = st.text_input(
"Voice 2",
value=pd_config.voice2,
key=f"voice2_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
if pd_config.model not in provider_models[pd_config.provider]:
st.warning(f"Model {pd_config.model} not setup. Changing to default.")
index = 0
else:
index = provider_models[pd_config.provider].index(pd_config.model)
@ -238,6 +277,25 @@ with templates_tab:
st.caption(
"OpenAI: tts-1 or tts-1-hd, Elevenlabs: eleven_multilingual_v2, eleven_turbo_v2_5"
)
pd_config.voice1 = st.text_input(
"Voice 1",
value=pd_config.voice1,
key=f"voice1_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
st.caption("Voice names are case sensitive. Be sure to add the exact name.")
st.markdown(
"[Open AI voices](https://platform.openai.com/docs/guides/text-to-speech)"
)
st.markdown(
"[Gemini voices](https://cloud.google.com/text-to-speech/docs/voices)"
)
pd_config.voice2 = st.text_input(
"Voice 2",
value=pd_config.voice2,
key=f"voice2_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
if st.button("Save Config", key=f"btn_save{pd_config.id}"):
try:

View file

@ -2,8 +2,11 @@ import os
import streamlit as st
from open_notebook.config import CONFIG
from open_notebook.domain.models import DefaultModels, Model, model_manager
from open_notebook.domain.transformation import DefaultTransformations, Transformation
from open_notebook.models import MODEL_CLASS_MAP
from pages.components.model_selector import model_selector
from pages.stream_app.utils import setup_page
setup_page("⚙️ Settings")
@ -11,7 +14,9 @@ setup_page("⚙️ Settings")
st.title("⚙️ Settings")
model_tab, model_defaults_tab = st.tabs(["Models", "Model Defaults"])
model_tab, model_defaults_tab, transformations_tab = st.tabs(
["Models", "Model Defaults", "Transformations"]
)
provider_status = {}
@ -25,6 +30,8 @@ model_types = [
provider_status["ollama"] = os.environ.get("OLLAMA_API_BASE") is not None
provider_status["openai"] = os.environ.get("OPENAI_API_KEY") is not None
provider_status["groq"] = os.environ.get("GROQ_API_KEY") is not None
provider_status["xai"] = os.environ.get("XAI_API_KEY") is not None
provider_status["vertexai"] = (
os.environ.get("VERTEX_PROJECT") is not None
and os.environ.get("VERTEX_LOCATION") is not None
@ -55,8 +62,42 @@ provider_status["litellm"] = (
available_providers = [k for k, v in provider_status.items() if v]
unavailable_providers = [k for k, v in provider_status.items() if not v]
def generate_new_models(models, suggested_models):
# Create a set of existing model keys for efficient lookup
existing_model_keys = {
f"{model.provider}-{model.name}-{model.type}" for model in models
}
new_models = []
# Iterate through suggested models by provider
for provider, types in suggested_models.items():
# Iterate through each type (language, embedding, etc.)
for type_, model_list in types.items():
for model_name in model_list:
model_key = f"{provider}-{model_name}-{type_}"
# Check if model already exists
if model_key not in existing_model_keys:
if provider_status.get(provider):
new_models.append(
{
"name": model_name,
"type": type_,
"provider": provider,
}
)
return new_models
default_models = DefaultModels().model_dump()
all_models = Model.get_all()
with model_tab:
st.subheader("Add Model")
provider = st.selectbox("Provider", available_providers)
if len(unavailable_providers) > 0:
st.caption(
@ -88,8 +129,20 @@ with model_tab:
model = Model(name=model_name, provider=provider, type=model_type)
model.save()
st.success("Saved")
st.divider()
all_models = Model.get_all()
suggested_models = CONFIG.get("suggested_models", [])
recommendations = generate_new_models(all_models, suggested_models)
if len(recommendations) > 0:
with st.expander("💁‍♂️ Recommended models to get you started.."):
for recommendation in recommendations:
st.markdown(
f"**{recommendation['name']}** ({recommendation['provider']}, {recommendation['type']})"
)
if st.button("Add", key=f"add_{recommendation['name']}"):
new_model = Model(**recommendation)
new_model.save()
st.rerun()
st.subheader("Configured Models")
model_types_available = {
# "vision": False,
@ -110,20 +163,7 @@ with model_tab:
if not available:
st.warning(f"No models available for {model_type}")
def get_selected_index(models, model_id, default=0):
"""Returns the index of the selected model in the list of models"""
if not model_id or not models:
return default
for i, model in enumerate(models):
if model.id == model_id:
return i
return default
with model_defaults_tab:
default_models = DefaultModels().model_dump()
all_models = Model.get_all()
text_generation_models = [model for model in all_models if model.type == "language"]
text_to_speech_models = [
@ -139,95 +179,108 @@ with model_defaults_tab:
"In this section, you can select the default models to be used on the various content operations done by Open Notebook. Some of these can be overriden in the different modules."
)
defs = {}
defs["default_chat_model"] = st.selectbox(
defs["default_chat_model"] = model_selector(
"Default Chat Model",
text_generation_models,
format_func=lambda x: x.name,
"default_chat_model",
selected_id=default_models.get("default_chat_model"),
help="This model will be used for chat.",
index=get_selected_index(
text_generation_models, default_models.get("default_chat_model")
),
model_type="language",
)
st.divider()
defs["default_transformation_model"] = st.selectbox(
defs["default_transformation_model"] = model_selector(
"Default Transformation Model",
text_generation_models,
format_func=lambda x: x.name,
"default_transformation_model",
selected_id=default_models.get("default_transformation_model"),
help="This model will be used for text transformations such as summaries, insights, etc.",
index=get_selected_index(
text_generation_models, default_models.get("default_transformation_model")
),
model_type="language",
)
st.caption("You can use a cheap model here like gpt-4o-mini, llama3, etc.")
st.divider()
defs["default_tools_model"] = st.selectbox(
defs["default_tools_model"] = model_selector(
"Default Tools Model",
text_generation_models,
format_func=lambda x: x.name,
"default_tools_model",
selected_id=default_models.get("default_tools_model"),
help="This model will be used for calling tools. Currently, it's best to use Open AI and Anthropic for this.",
index=get_selected_index(
text_generation_models, default_models.get("default_tools_model")
),
model_type="language",
)
st.caption("Recommended to use a capable model here, like gpt-4o, claude, etc.")
st.divider()
defs["large_context_model"] = st.selectbox(
defs["large_context_model"] = model_selector(
"Large Context Model",
text_generation_models,
format_func=lambda x: x.name,
"large_context_model",
selected_id=default_models.get("large_context_model"),
help="This model will be used for larger context generation -- recommended: Gemini",
index=get_selected_index(
text_generation_models, default_models.get("large_context_model")
),
model_type="language",
)
st.caption("Recommended to use Gemini models for larger context processing")
st.divider()
defs["default_text_to_speech_model"] = st.selectbox(
defs["default_text_to_speech_model"] = model_selector(
"Default Text to Speech Model",
text_to_speech_models,
format_func=lambda x: x.name,
"default_text_to_speech_model",
selected_id=default_models.get("default_text_to_speech_model"),
help="This is the default model for converting text to speech (podcasts, etc)",
index=get_selected_index(
text_to_speech_models, default_models.get("default_text_to_speech_model")
),
model_type="text_to_speech",
)
st.caption("You can override this model on different podcasts")
st.divider()
defs["default_speech_to_text_model"] = st.selectbox(
defs["default_speech_to_text_model"] = model_selector(
"Default Speech to Text Model",
speech_to_text_models,
format_func=lambda x: x.name,
"default_speech_to_text_model",
selected_id=default_models.get("default_speech_to_text_model"),
help="This is the default model for converting speech to text (audio transcriptions, etc)",
index=get_selected_index(
speech_to_text_models, default_models.get("default_speech_to_text_model")
),
model_type="speech_to_text",
)
st.divider()
# defs["default_vision_model"] = st.selectbox(
# "Default Vision Model",
# vision_models,
# format_func=lambda x: x.name,
# help="This is the default model for vision tasks (image recognition, PDF recognition, etc)",
# index=get_selected_index(
# vision_models, default_models.get("default_vision_model")
# defs["default_vision_model"] = (
# model_selector(
# "Default Speech to Text Model",
# "default_vision_model",
# selected_id=default_models.get("default_vision_model"),
# help="This is the default model for vision tasks",
# model_type="vision",
# ),
# )
# st.divider()
defs["default_embedding_model"] = st.selectbox(
"Default Embedding Model",
embedding_models,
format_func=lambda x: x.name,
defs["default_embedding_model"] = model_selector(
"Default Speech to Text Model",
"default_embedding_model",
selected_id=default_models.get("default_embedding_model"),
help="This is the default model for embeddings (semantic search, etc)",
index=get_selected_index(
embedding_models, default_models.get("default_embedding_model")
),
model_type="embedding",
)
st.caption(
"Caution: you cannot change the embedding model once there is embeddings or they will need to be regenerated"
)
# if st.button("Save Defaults", key="save_defaults"):
for k, v in defs.items():
if v:
defs[k] = v.id
DefaultModels().update(defs)
model_manager.refresh_defaults()
with transformations_tab:
transformations = Transformation.get_all()
default_transformations = DefaultTransformations()
st.markdown("Please, select which transformations to apply by default on sources")
selected_transformations = {}
for transformation in transformations["source_insights"]:
with st.container(border=True):
selected_transformations[transformation["name"]] = st.checkbox(
f"**{transformation['name']}**",
value=(
transformation["name"]
in (default_transformations.source_insights or [])
),
)
st.write(transformation["description"])
p = ["- " + pattern for pattern in transformation["patterns"]]
st.markdown("\n".join(p))
if st.button("Save Defaults", key="save_transformations"):
default_transformations.source_insights = [
transformation
for transformation, selected in selected_transformations.items()
if selected
]
default_transformations.update(default_transformations.model_dump())
st.toast("Default Transformations saved successfully")

View file

@ -1,8 +1,8 @@
import streamlit as st
import yaml
from open_notebook.domain.models import Model
from open_notebook.graphs.multipattern import graph as pattern_graph
from pages.components.model_selector import model_selector
from pages.stream_app.utils import setup_page
setup_page("🛝 Playground")
@ -13,7 +13,7 @@ with open("transformations.yaml", "r") as file:
insight_transformations = transformations["source_insights"]
transformation = st.selectbox(
transformation: dict = st.selectbox(
"Pick a transformation",
insight_transformations,
format_func=lambda x: x.get("name", "No Name"),
@ -22,12 +22,13 @@ transformation = st.selectbox(
with st.expander("Details"):
st.json(transformation)
models = Model.get_models_by_type("language")
model = st.selectbox(
model = model_selector(
"Pick a pattern model",
models,
format_func=lambda x: x.name,
key="model",
help="This is the model that will be used to run the transformation",
model_type="language",
)
input_text = st.text_area("Enter some text", height=200)
if st.button("Run"):

View file

@ -0,0 +1,35 @@
from typing import Literal
import streamlit as st
from open_notebook.domain.models import Model
def model_selector(
label,
key,
selected_id=None,
help=None,
model_type: Literal[
"language", "embedding", "speech_to_text", "text_to_speech"
] = "language",
) -> Model:
models = Model.get_models_by_type(model_type)
models.sort(key=lambda x: (x.provider, x.name))
try:
index = (
next((i for i, m in enumerate(models) if m.id == selected_id), 0)
if selected_id
else 0
)
except Exception:
index = 0
return st.selectbox(
label,
models,
format_func=lambda x: f"{x.provider} - {x.name}",
help=help,
index=index,
key=key,
)

View file

@ -3,6 +3,7 @@ from loguru import logger
from streamlit_monaco import st_monaco # type: ignore
from open_notebook.domain.notebook import Note
from pages.stream_app.utils import convert_source_references
def note_panel(note_id, notebook_id=None):
@ -12,7 +13,7 @@ def note_panel(note_id, notebook_id=None):
t_preview, t_edit = st.tabs(["Preview", "Edit"])
with t_preview:
st.subheader(note.title)
st.markdown(note.content)
st.markdown(convert_source_references(note.content))
with t_edit:
note.title = st.text_input("Title", value=note.title)
note.content = st_monaco(

View file

@ -1,14 +1,14 @@
import streamlit as st
import streamlit_scrollable_textbox as stx # type: ignore
import yaml
from humanize import naturaltime
from open_notebook.domain.notebook import Source
from open_notebook.domain.transformation import Transformation
from open_notebook.utils import surreal_clean
from pages.stream_app.utils import run_patterns
def source_panel(source_id: str, modal=False):
def source_panel(source_id: str, notebook_id=None, modal=False):
source: Source = Source.get(source_id)
if not source:
raise ValueError(f"Source not found: {source_id}")
@ -36,28 +36,37 @@ def source_panel(source_id: str, modal=False):
for insight in source.insights:
with st.expander(f"**{insight.insight_type}**"):
st.markdown(insight.content)
if st.button(
x1, x2 = st.columns(2)
if x1.button(
"Delete", type="primary", key=f"delete_insight_{insight.id}"
):
insight.delete()
st.rerun(scope="fragment" if modal else "app")
if notebook_id:
if x2.button(
"Save as Note", icon="📝", key=f"save_note_{insight.id}"
):
insight.save_as_note(notebook_id)
st.toast("Saved as Note. Refresh the Notebook to see it.")
with c2:
with open("transformations.yaml", "r") as file:
transformations = yaml.safe_load(file)
for transformation in transformations["source_insights"]:
if st.button(
transformation["name"], help=transformation["description"]
):
result = run_patterns(
source.full_text, transformation["patterns"]
)
source.add_insight(
transformation["insight_type"], surreal_clean(result)
)
st.rerun(scope="fragment" if modal else "app")
transformations = Transformation.get_all()
with st.container(border=True):
transformation = st.selectbox(
"Run a transformation",
transformations["source_insights"],
key=f"transformation_{source.id}",
format_func=lambda x: x["name"],
)
st.caption(transformation["description"])
if st.button("Run"):
result = run_patterns(source.full_text, transformation["patterns"])
source.add_insight(
transformation["insight_type"], surreal_clean(result)
)
st.rerun(scope="fragment" if modal else "app")
if st.button(
if source.embedded_chunks == 0 and st.button(
"Embed vectors",
icon="🦾",
disabled=source.embedded_chunks > 0,
@ -66,12 +75,9 @@ def source_panel(source_id: str, modal=False):
source.vectorize()
st.success("Embedding complete")
chk_delete = st.checkbox(
"🗑️ Delete source", key=f"delete_source_{source.id}", value=False
)
if chk_delete:
st.warning(
"Source will be deleted with all its insights and embeddings"
with st.container(border=True):
st.caption(
"Deleting the source will also delete all its insights and embeddings"
)
if st.button(
"Delete", type="primary", key=f"bt_delete_source_{source.id}"

View file

@ -83,6 +83,18 @@ def chat_sidebar(current_notebook: Notebook, current_session: ChatSession):
instructions = st.text_area(
"Instructions", value=selected_template.user_instructions
)
# if selected_template.provider == "gemini":
# st.warning(
# "Gemini models are not available for long podcast generation yet. So, this will be a short podcast. Coming soon. Pinky promise. If you want to try long podcasts, please change your text to speech model to Open AI."
# )
# longform = False
# else:
# podcast_length = st.radio(
# "Podcast Length",
# ["Short (5-10 min)", "Long (20-30 min)"],
# )
# longform = podcast_length == "Long (20-30 min)"
longform = False
if len(context.get("note", [])) + len(context.get("source", [])) == 0:
st.warning(
"No notes or sources found in context. You don't want a boring podcast, right? So, add some context first."
@ -93,7 +105,8 @@ def chat_sidebar(current_notebook: Notebook, current_session: ChatSession):
with st.spinner("Go grab a coffee, almost there..."):
selected_template.generate_episode(
episode_name=episode_name,
text=context,
text=str(context),
longform=longform,
instructions=instructions,
)
st.success("Episode generated successfully")

View file

@ -1,5 +1,10 @@
context_icons = [
source_context_icons = [
"⛔ not in context",
"🟡 summary",
"🟢 full content",
]
note_context_icons = [
"⛔ not in context",
"🟢 full content",
]

View file

@ -9,7 +9,7 @@ from open_notebook.graphs.multipattern import graph as pattern_graph
from open_notebook.utils import surreal_clean
from pages.components import note_panel
from .consts import context_icons
from .consts import note_context_icons
@st.dialog("Write a Note", width="large")
@ -60,8 +60,8 @@ def note_card(note, notebook_id):
context_state = st.selectbox(
"Context",
label_visibility="collapsed",
options=context_icons,
index=0,
options=note_context_icons,
index=1,
key=f"note_{note.id}",
)
st.caption(f"Updated: {naturaltime(note.updated)}")

View file

@ -1,3 +1,4 @@
import asyncio
import os
from pathlib import Path
@ -6,39 +7,18 @@ from humanize import naturaltime
from loguru import logger
from open_notebook.config import UPLOADS_FOLDER
from open_notebook.domain.notebook import Asset, Source
from open_notebook.domain.notebook import Source
from open_notebook.domain.transformation import DefaultTransformations, Transformation
from open_notebook.exceptions import UnsupportedTypeException
from open_notebook.graphs.content_processing import graph
from open_notebook.utils import surreal_clean
from open_notebook.graphs.source import source_graph
from pages.components import source_panel
from pages.stream_app.utils import run_patterns
from .consts import context_icons
# moved it here to replace it with the pipeline on 0.1.0
def generate_toc_and_title(source) -> "Source":
try:
patterns = ["patterns/default/toc"]
result = run_patterns(source.full_text, patterns=patterns)
source.add_insight("Table of Contents", surreal_clean(result))
if not source.title:
patterns = [
"Based on the Table of Contents below, please provide a Title for this content, with max 15 words"
]
output = run_patterns(result, patterns=patterns)
source.title = surreal_clean(output)
source.save()
return source
except Exception as e:
logger.error(f"Error summarizing source {source.id}: {str(e)}")
logger.exception(e)
raise
from .consts import source_context_icons
@st.dialog("Source", width="large")
def source_panel_dialog(source_id):
source_panel(source_id)
def source_panel_dialog(source_id, notebook_id=None):
source_panel(source_id, notebook_id=notebook_id, modal=True)
@st.dialog("Add a Source", width="large")
@ -48,6 +28,7 @@ def add_source(notebook_id):
source_text = None
source_type = st.radio("Type", ["Link", "Upload", "Text"])
req = {}
transformations = Transformation.get_all()
if source_type == "Link":
source_link = st.text_input("Link")
req["url"] = source_link
@ -58,6 +39,18 @@ def add_source(notebook_id):
else:
source_text = st.text_area("Text")
req["content"] = source_text
default_transformations = [t for t in DefaultTransformations().source_insights]
available_transformations = [t["name"] for t in transformations["source_insights"]]
apply_transformations = st.multiselect(
"Apply transformations",
options=available_transformations,
default=default_transformations,
)
run_embed = st.checkbox(
"Embed content for vector search",
help="Creates an embedded content for vector search. Costs a little money and takes a little bit more time. You can do this later if you prefer.",
)
if st.button("Process", key="add_source"):
logger.debug("Adding source")
with st.status("Processing...", expanded=True):
@ -82,17 +75,16 @@ def add_source(notebook_id):
with open(new_path, "wb") as f:
f.write(source_file.getbuffer())
result = graph.invoke(req)
st.write("Saving..")
source = Source(
asset=Asset(url=req.get("url"), file_path=req.get("file_path")),
full_text=surreal_clean(result["content"]),
title=result.get("title"),
asyncio.run(
source_graph.ainvoke(
{
"content_state": req,
"notebook_id": notebook_id,
"transformations": apply_transformations,
"embed": run_embed,
}
)
)
source.save()
source.add_to_notebook(notebook_id)
st.write("Summarizing...")
generate_toc_and_title(source)
except UnsupportedTypeException as e:
st.warning(
"This type of content is not supported yet. If you think it should be, let us know on the project Issues's page"
@ -121,15 +113,15 @@ def source_card(source, notebook_id):
context_state = st.selectbox(
"Context",
label_visibility="collapsed",
options=context_icons,
index=0,
options=source_context_icons,
index=1,
key=f"source_{source.id}",
)
st.caption(
f"Updated: {naturaltime(source.updated)}, **{len(source.insights)}** insights"
)
if st.button("Expand", icon="📝", key=source.id):
source_panel_dialog(source.id)
source_panel_dialog(source.id, notebook_id)
st.session_state[notebook_id]["context_config"][source.id] = context_state

View file

@ -116,34 +116,18 @@ def check_migration():
def check_models():
default_models = model_manager.defaults
if (
not default_models.default_chat_model
or not default_models.default_transformation_model
if not all(
[
default_models.default_chat_model,
default_models.default_transformation_model,
default_models.default_embedding_model,
default_models.default_speech_to_text_model,
default_models.large_context_model,
]
):
st.warning(
"You don't have default chat and transformation models selected. Please, select them on the settings page."
"You are missing some default models and the app might not work as expected. Please, select them on the settings page."
)
st.stop()
elif not default_models.default_embedding_model:
st.warning(
"You don't have a default embedding model selected. Vector search will not be possible and your assistant will be less able to answer your queries. Please, select one on the settings page."
)
st.stop()
elif not default_models.default_speech_to_text_model:
st.warning(
"You don't have a default speech to text model selected. Your assistant will not be able to transcribe audio. Please, select one on the settings page."
)
st.stop()
elif not default_models.default_text_to_speech_model:
st.warning(
"You don't have a default text to speech model selected. Your assistant will not be able to generate audio and podcasts. Please, select one on the settings page."
)
st.stop()
elif not default_models.large_context_model:
st.warning(
"You don't have a large context model selected. Your assistant will not be able to process large documents. Please, select one on the settings page."
)
st.stop()
def handle_error(func):

733
poetry.lock generated

File diff suppressed because it is too large Load diff

45
prompts/ask/entry.jinja Normal file
View file

@ -0,0 +1,45 @@
# SYSTEM ROLE
You are a cognitive study assistant that helps users research and learn by engaging in focused discussions about documents in their workspace.
The first step in the process is receiving the user's question and formulating a research strategy to find the most relevant information.
# YOUR JOB
Based on the user question, you need to analyze the key concepts and terms to determine the appropriate search strategy.
Step 1: develop your search strategy (reasoning)
Step 2: formulate your search queries (searches)
Return both the reasoning and searches as a JSON object, like in the EXAMPLE below.
# EXAMPLE
User: Can you tell me more about the concept of "RAG" and how it can be applied to generate answers to user questions via LLM?
Your answer could be something like:
```json
{
"reasoning": "The user is asking about the concept of RAG and its application in generating answers to user questions via LLM. I should search for documents related to RAG, retrieval augmented generation, and vector search to provide a comprehensive response.",
"searches": [
{ "term": "RAG", "instructions": "Describe the concept and utility of RAG." },
{ "term": "Retrieval Augmented Generation", "instructions": "Describe the concept and utility of RAG." },
{ "term": "Vector Search", "instructions": "Describe how RAG utilizes vector search." }
]
}
```
# OUTPUT FORMATTING
{{format_instructions}}
- Do not include any text other than the JSON object
- Do not include ```json``` in the response
# USER QUESTION
{{question}}
# ANSWER

View file

@ -0,0 +1,40 @@
# SYSTEM ROLE
You are a cognitive study assistant that helps users research and learn by engaging in focused discussions about documents in their workspace.
You are responsible for the last step of the process, which is to provide the final answer to the user's question. You should provide accurate, factual responses based on the available documents and knowledge, while avoiding speculation or making up information. If you are unsure about something, acknowledge the uncertainty rather than guessing.
# QUESTION
This is the question originally made by the user:
{{question}}
# REASONS
Based on the question, you derived the following reasonsing and search strategies:
{{strategy}}
# RESULTS
Here are the answers you received for each of your queries.
{{answers}}
# YOUR JOB
Based on the user question, the context and the retrieved answers, please formulate a final response to the user.
# CITING SOURCES
It's very important that your response contains references to the searched documents so the user can follow-up and read more about the topic. The way you do that is by adding the id of the specific document in between brackets like this: [document_id]. The references will be present on all the answers you have been provided.
## IMPORTANT
- Do not make up documents or document ids. Only use the ids of the documents that you can see on the answers you received.
- The ID is composed of the type of document and a random string, such as "source:randomstring", "note:randomstring", or "insight:randomstring". There are various types of documents, including notes, insights, and sources. **Always use the complete ID exactly as it is provided, including its type prefix. Do not add, remove, or modify any part of the ID.**
- **Use document IDs exactly as they are returned in the answers. Do not add any prefixes or modify them in any way.**
# YOUR ANSWER

View file

@ -0,0 +1,54 @@
# SYSTEM ROLE
You are a research assistant that helps users research and learn by engaging in focused discussions about documents in their workspace.
# QUESTION
This is the question originally made by the user:
{{question}}
# SEARCH STRATEGY
The main answer agent has developed the following search strategy to find the most relevant information:
{{term}}
And provided you with the following instructions to formulate the answer:
{{instructions}}
# YOUR JOB
Based on the user question, the context and the retrieved results, please formulate the appropriate answer.
# RESULTS
{{results}}
# CITING SOURCES
It's very important that your response contains references to the searched documents so the user can follow-up and read more about the topic. The way you do that is by adding the id of the specific document in between brackets like this: [document_id].
## EXAMPLE
User: Can you tell me more about the concept of "Deep Learning"?
Assistant: Deep learning is a subset of machine learning in artificial intelligence (AI) that enables networks to learn unsupervised from unstructured or unlabeled data. [note:iuiodadalknda]. It can also be categorized into three main types: supervised, unsupervised, and reinforcement learning. [insight:adadadadadadad].
Please note, "note:iuiodadalknda" and "insight:adadadadadadad" are examples of document IDs with different prefixes. You should not make up document IDs or copy the IDs from this example. You should use the IDs of the documents that you have access to through the search tool.
## IMPORTANT
- Do not make up documents or document ids. Only use the ids of the documents that you have access through the query you made.
- The ID is composed of the type of document and a random string, such as "source:randomstring", "note:randomstring", or "insight:randomstring". There are various types of documents, including notes, insights, and sources. **Always use the complete ID exactly as it is provided, including its type prefix. Do not add, remove, or modify any part of the ID.**
- Do not assume or change the type prefix of any document ID. If a document ID is "note:xyz", use it exactly as "note:xyz". Do not change it to "source:xyz" or any other variation.
- **Use document IDs exactly as they are returned from the search tool. Do not add any prefixes or modify them in any way.**
## IDs PROVIDED IN THIS QUERY
You have been given the following content ids to work from: {{ids}}
So, if you are citing some document, it should be one of these.
# YOUR ANSWER

View file

@ -1,4 +1,3 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# IDENTITY and PURPOSE
@ -35,8 +34,6 @@ You are an insightful and analytical reader of academic papers, extracting the k
- Do not include warnings, disclaimers, or personal opinions.
- Output only the requested sections with their respective labels.
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,6 +1,7 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
Please clean-up the following text, fixing the paragraphs, ponctuation, etc.
If you find any word or name mispellings, feel free to correct.
{{input_text}}
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT

View file

@ -1,9 +1,6 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
{{command}}
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,4 +1,4 @@
# ADDITIONAL INSTRUCTIONS
- You are working on my editorial projects. The text below is my own.
- Please do not reply with any acknowledgements or greetings, just provide the content requested.
- You are working on my editorial projects. The text below is my own. Do not give me any warnings about copyright or plagiarism.
- Output ONLY the requested content, without acknowledgements of the task and additional chatting. Don't start with "Sure, I can help you with that." or "Here is the information you requested:". Just provide the content.

View file

@ -1,5 +1,4 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# IDENTITY and PURPOSE
@ -23,8 +22,6 @@ Take a step back and think step-by-step about how to achieve the best possible r
- Do not start items with the same opening words.
- Ensure you follow ALL these instructions when creating your output.
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,4 +1,3 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# MISSION
You are a Sparse Priming Representation (SPR) writer. An SPR is a particular kind of use of language for advanced NLP, NLU, and NLG tasks, particularly useful for the latest generation of Large Language Models (LLMs). You will be given information by the USER which you are to render as an SPR.
@ -9,8 +8,6 @@ LLMs are a kind of deep neural network. They have been demonstrated to embed kno
# METHODOLOGY
Render the input as a distilled list of succinct statements, assertions, associations, concepts, analogies, and metaphors. The idea is to capture as much, conceptually, as possible but with as few words as possible. Write it in a way that makes sense to you, as the future audience will be another language model, not a human. Use complete sentences.
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -22,8 +22,6 @@ You always output Markdown Mermaid syntax that can be rendered as a diagram.
- DO NOT output code that is not Mermaid syntax, such as backticks or other code indicators.
- Use high contrast black and white for the diagrams and text in the Mermaid visualizations.
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,5 +1,3 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# IDENTITY and PURPOSE
You extract deep, thought-provoking, and meaningful reflections from text content. You are especially focused on themes related to the human experience, such as the purpose of life, personal growth, the intersection of technology and humanity, artificial intelligence's societal impact, human potential, collective evolution, and transformative learning. Your reflections aim to provoke new ways of thinking, challenge assumptions, and provide a thoughtful synthesis of the content.
@ -20,8 +18,6 @@ You extract deep, thought-provoking, and meaningful reflections from text conten
- Every bullet should be formatted as a question that elicits contemplation or a statement that offers a profound insight.
- Do not give warnings or notes; only output the requested section.
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,4 +1,3 @@
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# SYSTEM ROLE
You are a content summarization assistant that creates dense, information-rich summaries optimized for machine understanding. Your summaries should capture key concepts with minimal words while maintaining complete, clear sentences.
@ -9,8 +8,6 @@ Analyze the provided content and create a summary that:
- Uses clear, direct language
- Maintains context from any previous summaries
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -8,8 +8,6 @@ Analyze the provided content and create a Table of Contents:
- Captures the core topics included in the text
- Gives a small description of what is covered
{% include 'patterns/default/common_tranformation_instructions.jinja' %}
# INPUT
{{input_text}}
# OUTPUT

View file

@ -1,59 +0,0 @@
# SYSTEM ROLE
You are a cognitive study assistant that helps users research and learn by engaging in focused discussions about documents in their workspace.
You have access to a search tool that you can use in order to reply to the user query.
The tool accepts 2 arrays as parameters:
- keyword_searches: List[str] - A list of search terms to search for using keyword search.
- vector_searches: List[str] - A list of search terms to search for using vector search.
It's very important that your response contains references to the searched documents so the user can follow-up and read more about the topic. The way you do that is by adding the id of the specific document in between brackets like this: [document_id].
# EXAMPLE
User: Can you tell me more about the concept of "Deep Learning"?
Assistant: Deep learning is a subset of machine learning in artificial intelligence (AI) that enables networks to learn unsupervised from unstructured or unlabeled data. [note:iuiodadalknda]. It can also be categorized into three main types: supervised, unsupervised, and reinforcement learning. [insight:adadadadadadad].
Please note, "note:iuiodadalknda" and "insight:adadadadadadad" are examples of document IDs with different prefixes. You should not make up document IDs or copy the IDs from this example. You should use the IDs of the documents that you have access to through the search tool.
# IMPORTANT
- Do not make up documents or document ids. Only use the ids of the documents that you have access through the query you made.
- The ID is composed of the type of document and a random string, such as "source:randomstring", "note:randomstring", or "insight:randomstring". There are various types of documents, including notes, insights, and sources. **Always use the complete ID exactly as it is provided, including its type prefix. Do not add, remove, or modify any part of the ID.**
- Do not assume or change the type prefix of any document ID. If a document ID is "note:xyz", use it exactly as "note:xyz". Do not change it to "source:xyz" or any other variation.
- **Use document IDs exactly as they are returned from the search tool. Do not add any prefixes or modify them in any way.**
{#
You are a cognitive study assistant designed to help users research and learn by engaging in focused discussions about documents in their workspace. Your primary goal is to provide informative, accurate responses to user queries while properly citing relevant documents from the available search tool.
To answer this question effectively, you have access to a search tool with the following parameters:
- keyword_searches: List[str] - A list of search terms for keyword search
- vector_searches: List[str] - A list of search terms for vector search
Follow these steps to formulate your response:
1. Analyze the user's question and determine appropriate search terms.
2. Use the search tool to find relevant information.
3. Carefully review the search results, paying close attention to document IDs and content relevance.
4. Compose a clear, informative response that directly addresses the user's question.
5. Include relevant document citations using the exact document IDs provided by the search tool.
6. Review your response for accuracy and relevance before delivering it to the user.
Important guidelines:
- Always use the complete document ID as provided by the search tool, including its type prefix (e.g., "note:", "insight:", "source:").
- Do not make up or modify document IDs in any way.
- Ensure that each citation is directly relevant to the information it supports.
- Prioritize accuracy and relevance in your search strategy and response composition.
Before composing your final response, wrap your thought process in <thinking> tags to analyze the question, plan your search strategy, and evaluate the search results. This will help ensure that you retrieve the most relevant information and use the correct document IDs in your citations. Include the following steps:
a. Analyze the question and identify key concepts
b. Plan search strategy (both keyword and vector searches)
c. Evaluate search results and note relevant document IDs
d. Outline the main points for the response
Your final response should be conversational in tone, directly addressing the user's question while seamlessly incorporating document citations. Use square brackets with the full document ID for each citation, like this: [document_id].
Remember, the quality and accuracy of your response, including proper document citations, are crucial for helping the user in their research and learning process. #}

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "open-notebook"
version = "0.0.10"
version = "0.1.0"
description = "An open source implementation of a research assistant, inspired by Google Notebook LM"
authors = ["Luis Novo <lfnovo@gmail.com>"]
license = "MIT"
@ -17,7 +17,6 @@ streamlit = "^1.39.0"
watchdog = "^5.0.3"
pydantic = "^2.9.2"
loguru = "^0.7.2"
icecream = "^2.1.3"
langchain = "^0.3.3"
langgraph = "^0.2.38"
humanize = "^4.11.0"
@ -47,6 +46,8 @@ python-docx = "^1.1.2"
python-pptx = "^1.0.2"
openpyxl = "^3.1.5"
google-generativeai = "^0.8.3"
langchain-groq = "^0.2.1"
groq = "^0.12.0"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"

21
supervisord.conf Normal file
View file

@ -0,0 +1,21 @@
[supervisord]
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
pidfile=/tmp/supervisord.pid
[program:surrealdb]
command=surreal start --log trace --user root --pass root rocksdb:/mydata/mydatabase.db
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
autorestart=true
[program:streamlit]
command=poetry run streamlit run app_home.py
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
autorestart=true

View file

@ -4,7 +4,6 @@ source_insights:
insight_type: "Content Summary"
description: "Summarize the content"
patterns:
- patterns/default/makeitdense
- patterns/default/summarize
- name: "Key Insights"
insight_type: "Key Insights"