fix(jsonl): align count-only baseline parsing
This commit is contained in:
parent
7518b5af1d
commit
43afc9f907
2 changed files with 70 additions and 4 deletions
|
|
@ -267,17 +267,38 @@ function isCountableJsonlEntryLine(line: string): boolean {
|
|||
message?: unknown;
|
||||
};
|
||||
|
||||
if (typeof entry.uuid !== 'string' || !parseMessageType(String(entry.type))) {
|
||||
const type = typeof entry.type === 'string' ? parseMessageType(entry.type) : null;
|
||||
if (typeof entry.uuid !== 'string' || entry.uuid.length === 0 || !type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (entry.type === 'user' || entry.type === 'assistant') {
|
||||
return entry.message != null && typeof entry.message === 'object';
|
||||
if (type === 'user') {
|
||||
if (entry.message == null) {
|
||||
return false;
|
||||
}
|
||||
const content = (entry.message as { content?: unknown }).content;
|
||||
return content == null || isParserSafeContent(content);
|
||||
}
|
||||
|
||||
if (type === 'assistant') {
|
||||
if (!isJsonObjectRecord(entry.message)) {
|
||||
return false;
|
||||
}
|
||||
const content = entry.message.content;
|
||||
return isParserSafeContent(content);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Type guard that narrows an unknown value to `Record<string, unknown>`.
 *
 * @param value - Any value, e.g. a field read from a parsed JSON entry.
 * @returns `true` only when `value` is not `null`/`undefined`, has
 *   `typeof value === 'object'`, and is not an array; `false` otherwise.
 */
function isJsonObjectRecord(value: unknown): value is Record<string, unknown> {
|
||||
return value != null && typeof value === 'object' && !Array.isArray(value);
|
||||
}
|
||||
|
||||
/**
 * Checks whether a `content` value has one of the two accepted shapes.
 *
 * @param value - The candidate content value.
 * @returns `true` when `value` is a string, or an array in which no element
 *   is `null`/`undefined` (an empty array therefore passes); `false` for
 *   every other value, including `null` and `undefined` themselves.
 */
function isParserSafeContent(value: unknown): boolean {
|
||||
return typeof value === 'string' || (Array.isArray(value) && value.every((item) => item != null));
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Entry Parsing
|
||||
// =============================================================================
|
||||
|
|
|
|||
|
|
@ -212,10 +212,49 @@ describe('jsonl', () => {
|
|||
timestamp: '2026-01-01T00:00:02.000Z',
|
||||
content: 'system line',
|
||||
});
|
||||
const validUserWithoutContent = JSON.stringify({
|
||||
type: 'user',
|
||||
uuid: 'u1',
|
||||
timestamp: '2026-01-01T00:00:03.000Z',
|
||||
message: {
|
||||
role: 'user',
|
||||
},
|
||||
});
|
||||
const validUserArrayMessage = JSON.stringify({
|
||||
type: 'user',
|
||||
uuid: 'u2',
|
||||
timestamp: '2026-01-01T00:00:04.000Z',
|
||||
message: [],
|
||||
});
|
||||
const invalidMissingMessage = JSON.stringify({
|
||||
type: 'assistant',
|
||||
uuid: 'bad-assistant',
|
||||
});
|
||||
const invalidEmptyUuid = JSON.stringify({
|
||||
type: 'system',
|
||||
uuid: '',
|
||||
content: 'empty uuid',
|
||||
});
|
||||
const invalidAssistantMissingContent = JSON.stringify({
|
||||
type: 'assistant',
|
||||
uuid: 'bad-assistant-content',
|
||||
message: {
|
||||
role: 'assistant',
|
||||
},
|
||||
});
|
||||
const invalidAssistantArrayMessage = JSON.stringify({
|
||||
type: 'assistant',
|
||||
uuid: 'bad-assistant-array',
|
||||
message: [],
|
||||
});
|
||||
const invalidAssistantNullContentBlock = JSON.stringify({
|
||||
type: 'assistant',
|
||||
uuid: 'bad-assistant-null-block',
|
||||
message: {
|
||||
role: 'assistant',
|
||||
content: [null],
|
||||
},
|
||||
});
|
||||
const unknownType = JSON.stringify({
|
||||
type: 'unknown',
|
||||
uuid: 'unknown-1',
|
||||
|
|
@ -228,7 +267,13 @@ describe('jsonl', () => {
|
|||
[
|
||||
validAssistant,
|
||||
validSystem,
|
||||
validUserWithoutContent,
|
||||
validUserArrayMessage,
|
||||
invalidMissingMessage,
|
||||
invalidEmptyUuid,
|
||||
invalidAssistantMissingContent,
|
||||
invalidAssistantArrayMessage,
|
||||
invalidAssistantNullContentBlock,
|
||||
unknownType,
|
||||
'not json',
|
||||
partialJson,
|
||||
|
|
@ -239,7 +284,7 @@ describe('jsonl', () => {
|
|||
const parsed = await parseJsonlFileWithStats(filePath);
|
||||
const counted = await countJsonlFileWithStats(filePath);
|
||||
|
||||
expect(parsed.messages.map((message) => message.uuid)).toEqual(['a1', 's1']);
|
||||
expect(parsed.messages.map((message) => message.uuid)).toEqual(['a1', 's1', 'u1', 'u2']);
|
||||
expect(counted.parsedLineCount).toBe(parsed.parsedLineCount);
|
||||
expect(counted.consumedBytes).toBe(parsed.consumedBytes);
|
||||
} finally {
|
||||
|
|
|
|||
Loading…
Reference in a new issue