feat: detect HTML content in clipboard for text sources (#475)

* chore: bump content-core to support html to markdown

* feat: detect HTML content in clipboard for text sources

- Add paste handler to detect text/html format in clipboard
- Use HTML content instead of plain text when available
- Display info message when HTML is detected
- Add translations for all supported languages (en-US, pt-BR, ja-JP, zh-CN, zh-TW)

* fix: reset HTML detection banner on plain text paste

Clear the hasHtmlContent flag when pasting plain text (no HTML in
clipboard) so the banner doesn't persist incorrectly after replacing
HTML content with plain text.
This commit is contained in:
Luis Novo 2026-01-25 21:36:58 -03:00 committed by GitHub
parent a329806a33
commit 6dc9a3db50
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 73 additions and 8 deletions

View file

@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- HTML clipboard detection for text sources (#426)
- When pasting content, automatically detects HTML format (e.g., from Word, web pages)
- Shows info message when HTML is detected, informing user it will be converted to Markdown
- Preserves formatting that would be lost with plain text paste
- Bump content-core to 1.13.0 for HTML to Markdown conversion support
## [1.6.2] - 2026-01-24
### Fixed

View file

@ -126,6 +126,7 @@ export function AddSourceDialog({
handleSubmit,
control,
watch,
setValue,
formState: { errors },
reset,
} = useForm<CreateSourceFormData>({
@ -553,6 +554,7 @@ export function AddSourceDialog({
// @ts-expect-error - Type inference issue with zod schema
control={control}
register={register}
setValue={setValue}
// @ts-expect-error - Type inference issue with zod schema
errors={errors}
urlValidationErrors={urlValidationErrors}

View file

@ -1,7 +1,7 @@
"use client"
import { useMemo } from "react"
import { Control, FieldErrors, UseFormRegister, useWatch } from "react-hook-form"
import { useMemo, useState } from "react"
import { Control, FieldErrors, UseFormRegister, UseFormSetValue, useWatch } from "react-hook-form"
import { FileIcon, LinkIcon, FileTextIcon } from "lucide-react"
import { useTranslation } from "@/lib/hooks/use-translation"
import { FormSection } from "@/components/ui/form-section"
@ -89,6 +89,7 @@ const getSourceTypes = (t: TranslationKeys) => [
interface SourceTypeStepProps {
control: Control<CreateSourceFormData>
register: UseFormRegister<CreateSourceFormData>
setValue: UseFormSetValue<CreateSourceFormData>
errors: FieldErrors<CreateSourceFormData>
urlValidationErrors?: { url: string; line: number }[]
onClearUrlErrors?: () => void
@ -96,13 +97,39 @@ interface SourceTypeStepProps {
const MAX_BATCH_SIZE = 50
export function SourceTypeStep({ control, register, errors, urlValidationErrors, onClearUrlErrors }: SourceTypeStepProps) {
export function SourceTypeStep({ control, register, setValue, errors, urlValidationErrors, onClearUrlErrors }: SourceTypeStepProps) {
const { t } = useTranslation()
// Watch the selected type and inputs to detect batch mode
const selectedType = useWatch({ control, name: 'type' })
const urlInput = useWatch({ control, name: 'url' })
const fileInput = useWatch({ control, name: 'file' })
// Track if HTML content was pasted
const [hasHtmlContent, setHasHtmlContent] = useState(false)
/**
 * Paste handler for the text-source textarea.
 *
 * If the clipboard exposes a `text/html` flavour, the browser's default paste
 * is cancelled and the raw HTML is spliced into the `content` field in place
 * of the current selection; the HTML banner flag is then raised. If no HTML is
 * present, the default plain-text paste is left untouched and the flag is
 * cleared.
 *
 * @param event - Clipboard event fired by the textarea.
 */
const handleTextPaste = (event: React.ClipboardEvent<HTMLTextAreaElement>) => {
  const htmlContent = event.clipboardData.getData('text/html')

  // Plain text paste - let the browser handle it and clear the HTML indicator
  if (!htmlContent) {
    setHasHtmlContent(false)
    return
  }

  event.preventDefault()

  // Snapshot the field before it is rewritten
  const textarea = event.currentTarget
  const start = textarea.selectionStart
  const end = textarea.selectionEnd
  const currentValue = textarea.value
  const head = currentValue.substring(0, start)
  const tail = currentValue.substring(end)

  // Insert HTML content at cursor position (replacing selection if any).
  // shouldDirty keeps the form's dirty state in line with a native paste,
  // which we suppressed above.
  setValue('content', head + htmlContent + tail, { shouldValidate: true, shouldDirty: true })
  setHasHtmlContent(true)

  // Rewriting a textarea's value drops the caret to the very end. Put it back
  // right after the inserted fragment. The position is measured from the end
  // of the live value so that any newline normalisation applied by the element
  // cannot skew it.
  // NOTE(review): assumes setValue has already written the new value to this
  // element by the time it returns - confirm for the Textarea wrapper in use.
  const caret = textarea.value.length - tail.length
  textarea.setSelectionRange(caret, caret)
}
// Batch mode detection
const { isBatchMode, itemCount, urlCount, fileCount } = useMemo(() => {
let urlCount = 0
@ -258,11 +285,19 @@ export function SourceTypeStep({ control, register, errors, urlValidationErrors,
{type.value === 'text' && (
<div>
<Label htmlFor="content" className="mb-2 block">{t.sources.textContentLabel}</Label>
{hasHtmlContent && (
<div className="mb-2 p-2 bg-blue-50 dark:bg-blue-950 border border-blue-200 dark:border-blue-800 rounded-md">
<p className="text-sm text-blue-700 dark:text-blue-300">
{t.sources.htmlDetected}
</p>
</div>
)}
<Textarea
id="content"
{...register('content')}
placeholder={t.sources.textPlaceholder}
rows={6}
onPaste={handleTextPaste}
/>
{errors.content && (
<p className="text-sm text-destructive mt-1">{errors.content.message}</p>

View file

@ -379,6 +379,7 @@ export const enUS = {
selectMultipleFilesHint: 'Select multiple files to batch import. Supported: Documents (PDF, DOC, DOCX, PPT, XLS, EPUB, TXT, MD), Media (MP4, MP3, WAV, M4A), Images (JPG, PNG), Archives (ZIP)',
selectedFiles: 'Selected files:',
textPlaceholder: 'Paste or type your content here...',
htmlDetected: 'HTML content detected. It will be converted to Markdown after processing.',
titlePlaceholder: 'Give your source a descriptive title',
batchTitlesAuto: 'Titles will be automatically generated for each source.',
batchCommonSettings: 'The same notebooks and transformations will be applied to all items.',

View file

@ -379,6 +379,7 @@ export const jaJP = {
selectMultipleFilesHint: '複数ファイルを選択して一括インポート。対応形式ドキュメントPDF、DOC、DOCX、PPT、XLS、EPUB、TXT、MD、メディアMP4、MP3、WAV、M4A、画像JPG、PNG、アーカイブZIP',
selectedFiles: '選択されたファイル:',
textPlaceholder: 'コンテンツを貼り付けまたは入力...',
htmlDetected: 'HTMLコンテンツが検出されました。処理後にMarkdownに変換されます。',
titlePlaceholder: 'ソースにわかりやすいタイトルを付けてください',
batchTitlesAuto: 'タイトルは各ソースごとに自動生成されます。',
batchCommonSettings: '同じノートブックとトランスフォーメーションがすべてのアイテムに適用されます。',

View file

@ -379,6 +379,7 @@ export const ptBR = {
selectMultipleFilesHint: "Selecione múltiplos arquivos para importação em lote. Suportados: Documentos (PDF, DOC, DOCX, PPT, XLS, EPUB, TXT, MD), Mídia (MP4, MP3, WAV, M4A), Imagens (JPG, PNG), Arquivos (ZIP)",
selectedFiles: "Arquivos selecionados:",
textPlaceholder: "Cole ou digite seu conteúdo aqui...",
htmlDetected: "Conteúdo HTML detectado. Será convertido para Markdown após o processamento.",
titlePlaceholder: "Dê um título descritivo para sua fonte",
batchTitlesAuto: "Os títulos serão gerados automaticamente para cada fonte.",
batchCommonSettings: "Os mesmos cadernos e transformações serão aplicados a todos os itens.",

View file

@ -402,6 +402,7 @@ export const zhCN = {
selectMultipleFilesHint: '选择多个文件进行批量导入。支持:文档 (PDF, DOC, DOCX, PPT, XLS, EPUB, TXT, MD),媒体 (MP4, MP3, WAV, M4A),图片 (JPG, PNG),归档 (ZIP)',
selectedFiles: '已选择文件:',
textPlaceholder: '在此处粘贴或输入您的内容...',
htmlDetected: '检测到 HTML 内容。处理后将转换为 Markdown。',
titlePlaceholder: '为您的来源起一个描述性的标题',
batchTitlesAuto: '将为每个来源自动生成标题。',
batchCommonSettings: '同样的笔记本和转换将应用于所有项目。',

View file

@ -402,6 +402,7 @@ export const zhTW = {
selectMultipleFilesHint: '選擇多個檔案進行批次導入。支援:文件 (PDF, DOC, DOCX, PPT, XLS, EPUB, TXT, MD),媒體 (MP4, MP3, WAV, M4A),圖片 (JPG, PNG),歸檔 (ZIP)',
selectedFiles: '已選擇檔案:',
textPlaceholder: '在此處貼上或輸入您的內容...',
htmlDetected: '偵測到 HTML 內容。處理後將轉換為 Markdown。',
titlePlaceholder: '為您的來源取一個描述性的標題',
batchTitlesAuto: '將為每個來源自動生成標題。',
batchCommonSettings: '相同的筆記本和轉換將應用於所有項目。',

View file

@ -33,7 +33,7 @@ dependencies = [
"tomli>=2.0.2",
"python-dotenv>=1.0.1",
"httpx[socks]>=0.27.0",
"content-core>=1.0.2,<2",
"content-core>=1.13,<2",
"ai-prompter>=0.3,<1",
"esperanto>=2.17.2,<3",
"surrealdb>=1.0.4",

22
uv.lock
View file

@ -432,7 +432,7 @@ wheels = [
[[package]]
name = "content-core"
version = "1.10.0"
version = "1.13.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "ai-prompter" },
@ -447,6 +447,7 @@ dependencies = [
{ name = "langdetect" },
{ name = "langgraph" },
{ name = "loguru" },
{ name = "markdownify" },
{ name = "moviepy" },
{ name = "openpyxl" },
{ name = "pandas" },
@ -461,9 +462,9 @@ dependencies = [
{ name = "validators" },
{ name = "youtube-transcript-api" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4a/b5/1322ad0b3d9eb86bbc8efe76e3e569ad32caaf7769973ad166b21468857f/content_core-1.10.0.tar.gz", hash = "sha256:e8f83b5675b24b0b8a38dad0dff5a878b9efe2d5e00b4d71ea3e3073d2eff000", size = 20737121, upload-time = "2026-01-16T20:12:51.308Z" }
sdist = { url = "https://files.pythonhosted.org/packages/c4/0d/53f37e46550d2a75ded08b1eccb2fccdd982fa7250b9da32e67d2592abf7/content_core-1.13.0.tar.gz", hash = "sha256:e101114517e6d4b6356f8ae5ca2ab88cd2acc0e20fe3944d470440e338e5550a", size = 20744666, upload-time = "2026-01-26T00:00:34.576Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3c/bc/1573354487143af2f3d401454f5801ccbd8a9e426e3d7b5095453ddbb6da/content_core-1.10.0-py3-none-any.whl", hash = "sha256:e8c4ef011224a376b719a73243f9a432e7e4694049e77129eae62370e7e93152", size = 183303, upload-time = "2026-01-16T20:12:49.116Z" },
{ url = "https://files.pythonhosted.org/packages/ac/06/7303eafde48316fde34822e8cd750d9f93a281a0cef13b26ac07b026dbf2/content_core-1.13.0-py3-none-any.whl", hash = "sha256:5c8d3a21e62c0dd1b001bdc3caf29449298fb493243659401bf4bc13f46be59b", size = 188220, upload-time = "2026-01-26T00:00:37.434Z" },
]
[[package]]
@ -2067,6 +2068,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
]
[[package]]
name = "markdownify"
version = "1.2.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3f/bc/c8c8eea5335341306b0fa7e1cb33c5e1c8d24ef70ddd684da65f41c49c92/markdownify-1.2.2.tar.gz", hash = "sha256:b274f1b5943180b031b699b199cbaeb1e2ac938b75851849a31fd0c3d6603d09", size = 18816, upload-time = "2025-11-16T19:21:18.565Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/ce/f1e3e9d959db134cedf06825fae8d5b294bd368aacdd0831a3975b7c4d55/markdownify-1.2.2-py3-none-any.whl", hash = "sha256:3f02d3cc52714084d6e589f70397b6fc9f2f3a8531481bf35e8cc39f975e186a", size = 15724, upload-time = "2025-11-16T19:21:17.622Z" },
]
[[package]]
name = "markupsafe"
version = "3.0.3"
@ -2429,7 +2443,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "ai-prompter", specifier = ">=0.3,<1" },
{ name = "content-core", specifier = ">=1.0.2,<2" },
{ name = "content-core", specifier = ">=1.13,<2" },
{ name = "esperanto", specifier = ">=2.17.2,<3" },
{ name = "fastapi", specifier = ">=0.104.0" },
{ name = "httpx", extras = ["socks"], specifier = ">=0.27.0" },