Open-scribe · olehokilko-alt · Feb 28, 2026
@@ -0,0 +1,32 @@
+# feat(pipeline): add verification module
+
+Adds a standalone verification library at `packages/pipeline/verification/`.
+
+It checks clinical notes against the source transcript using token matching, to see whether each claim in the note is actually supported by what was said.
+
+## What's in here
+
+- `types.ts` - types for claims, verdicts, etc
+- `verifier.ts` - core matching logic (tokenize, overlap calc)
+- `note-verifier.ts` - main `verifyNote()` function
+- tests for both
+
+## What's NOT touched
+
+Nothing. This is new code only, no changes to existing files.
+
+- no tsconfig changes
+- no storage type changes  
+- no pipeline wiring
+
+## safe to merge
+
+It's completely isolated: just a library sitting in its own folder.
+
+## testing
+
+```bash
+npx tsx --test packages/pipeline/verification/src/__tests__/*.test.ts
+```
+
+13 tests, all pass.
@@ -0,0 +1,36 @@
+# verification
+
+Validates clinical notes against source transcripts using token matching.
+
+Not wired into the pipeline yet - just a standalone lib.
+
+## quick example
+
+```typescript
+import { verifyNote } from './src/note-verifier'
+
+const result = await verifyNote(
+  'Patient has headache for 3 days.',
+  'Patient reported headache lasting 3 days.'
+)
+
+console.log(result.status)  // 'verified' | 'partial' | 'failed'
+```
+
+## how it works
+
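+1. Split the note into sentences (claims); fragments of 10 characters or fewer are dropped
+2. Classify each claim (fact, inference, opinion, instruction, or question)
+3. Match each claim against transcript chunks (one chunk per non-empty transcript line)
+4. Score each claim/chunk pair from token overlap and number coverage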
+
+## exports
+
+- `verifyNote(note, transcript, opts?)` - main api
+- `tokenize`, `extractNumbers`, `calculateOverlap`, `classifyClaim` - utils
+
+## run tests
+
+```bash
+npx tsx --test packages/pipeline/verification/src/__tests__/*.test.ts
+```
@@ -0,0 +1,46 @@
+import { describe, it } from 'node:test'
+import assert from 'node:assert'
+import { verifyNote } from '../note-verifier'
+
+// Fixture: a short doctor/patient exchange. The literal starts and ends with a
+// newline, so the transcript contains blank first and last lines.
+const sampleTranscript = `
+Doctor: Good morning, what brings you in today?
+Patient: I've been having this really bad headache for the past 3 days.
+Doctor: Pain severity?
+Patient: About 7 or 8 out of 10.
+Doctor: Blood pressure is 128/82, temperature 98.4.
+`
+
+// Note whose complaint, duration, pain score and blood pressure all appear in sampleTranscript.
+const goodNote = `Patient presents with headache for 3 days. Pain severity 7-8/10. Vitals: BP 128/82.`
+// Note whose complaint and readings (chest pain, 5 days, 180/110) do not appear in sampleTranscript.
+const badNote = `Patient presents with chest pain for 5 days. BP 180/110.`
+
+// End-to-end checks for verifyNote against the fixtures declared above.
+describe('verifyNote', () => {
+  // A faithful note must not fail, must carry some confidence, and must yield claims.
+  it('verifies matching note', async () => {
+    const { status, summary, claims } = await verifyNote(goodNote, sampleTranscript)
+    assert.ok(status === 'verified' || status === 'partial')
+    assert.ok(summary.overallConfidence > 0.3)
+    assert.ok(claims.length > 0)
+  })
+
+  // A note describing a different complaint must score low overall.
+  it('flags mismatch', async () => {
+    const { summary } = await verifyNote(badNote, sampleTranscript)
+    assert.ok(summary.overallConfidence < 0.3)
+  })
+
+  // No note text means no claims, which is reported as 'verified'.
+  it('handles empty note', async () => {
+    const outcome = await verifyNote('', sampleTranscript)
+    assert.strictEqual(outcome.claims.length, 0)
+    assert.strictEqual(outcome.status, 'verified')
+  })
+
+  // With nothing to match against, confidence must stay low.
+  it('handles empty transcript', async () => {
+    const { summary } = await verifyNote(goodNote, '')
+    assert.ok(summary.overallConfidence < 0.5)
+  })
+
+  // factsOnly must leave only claims classified as 'fact' in the result.
+  it('respects factsOnly', async () => {
+    const { claims } = await verifyNote(goodNote, sampleTranscript, { factsOnly: true })
+    claims.forEach(claim => {
+      assert.strictEqual(claim.kind, 'fact')
+    })
+  })
+})
@@ -0,0 +1,47 @@
+import { describe, it } from 'node:test'
+import assert from 'node:assert'
+import { tokenize, extractNumbers, calculateOverlap, classifyClaim } from '../verifier'
+
+// tokenize: content words are kept, stop words are removed, empty input gives no tokens.
+describe('tokenize', () => {
+  it('extracts tokens, filters stopwords', () => {
+    const found = new Set(tokenize('Patient reports headache for 3 days'))
+    assert.ok(found.has('headache'))
+    assert.ok(!found.has('for'))
+  })
+
+  it('handles empty', () => {
+    const nothing: string[] = []
+    assert.deepStrictEqual(tokenize(''), nothing)
+  })
+})
+
+// extractNumbers: both integers and decimals must be picked up from free text.
+describe('extractNumbers', () => {
+  it('extracts numbers and decimals', () => {
+    const found = extractNumbers('BP 120/80, temp 98.6')
+    for (const expected of ['120', '98.6']) {
+      assert.ok(found.includes(expected))
+    }
+  })
+})
+
+// calculateOverlap: boundary values of the ratio (identical text vs. disjoint text).
+describe('calculateOverlap', () => {
+  it('returns 1.0 for same text', () => {
+    const phrase = 'severe headache'
+    assert.strictEqual(calculateOverlap(phrase, phrase), 1.0)
+  })
+
+  it('returns 0 for no match', () => {
+    const ratio = calculateOverlap('headache pain', 'cardiac issues')
+    assert.strictEqual(ratio, 0)
+  })
+})
+
+// classifyClaim: one representative sentence per expected kind.
+describe('classifyClaim', () => {
+  const cases = [
+    ['identifies facts', 'Patient has hypertension.', 'fact'],
+    ['identifies questions', 'Does the patient smoke?', 'question'],
+    ['identifies inferences', 'I think this might be migraine.', 'inference']
+  ] as const
+
+  for (const [title, sentence, expected] of cases) {
+    it(title, () => {
+      assert.strictEqual(classifyClaim(sentence), expected)
+    })
+  }
+})
@@ -0,0 +1,3 @@
+export type { Claim, ClaimKind, Evidence, Verdict, VerificationResult, VerificationSummary, VerificationOptions } from './types'
+export { verifyNote } from './note-verifier'
+export { tokenize, extractNumbers, calculateOverlap, classifyClaim } from './verifier'
@@ -0,0 +1,73 @@
+import type { Claim, Evidence, VerificationResult, VerificationSummary, VerificationOptions } from './types'
+import { looksSupported, classifyClaim, determineVerdict } from './verifier'
+
+/**
+ * Splits note text into candidate claim sentences.
+ *
+ * Runs of newlines are collapsed to a single space, the text is cut at
+ * whitespace that follows `.`, `!` or `?`, and each piece is trimmed.
+ * Pieces of 10 characters or fewer are discarded.
+ *
+ * @param text Raw note text.
+ * @returns The surviving sentences, in note order.
+ */
+function extractClaims(text: string): string[] {
+  const flattened = text.replace(/\n+/g, ' ')
+  const sentences: string[] = []
+  for (const piece of flattened.split(/(?<=[.!?])\s+/)) {
+    const sentence = piece.trim()
+    if (sentence.length > 10) sentences.push(sentence)
+  }
+  return sentences
+}
+
+/**
+ * Breaks a transcript into one chunk per non-blank line.
+ *
+ * @param transcript Raw transcript text, lines separated by `\n`.
+ * @returns Trimmed line text plus a `line:<n>` reference, where n is the
+ *   1-based position of the line in the original transcript (blank lines
+ *   are skipped but still counted, so the reference points at the real line).
+ */
+function chunkTranscript(transcript: string): { text: string; ref: string }[] {
+  const chunks: { text: string; ref: string }[] = []
+  // Number the lines before dropping blanks; numbering afterwards would shift
+  // every reference that follows an empty line.
+  transcript.split('\n').forEach((line, index) => {
+    const text = line.trim()
+    if (text) chunks.push({ text, ref: `line:${index + 1}` })
+  })
+  return chunks
+}
+
+/**
+ * Scores a claim against every transcript chunk and keeps the strongest matches.
+ *
+ * Only the numeric score returned by `looksSupported` is used; its boolean
+ * result is discarded, so the thresholds in `opts` do not filter evidence here.
+ *
+ * @param claim Claim sentence to check.
+ * @param chunks Transcript chunks to search.
+ * @param opts Thresholds forwarded to `looksSupported`.
+ * @returns Up to three chunks scoring above 0.1, highest first, together with
+ *   the highest score (0 when no chunk scores above 0.1).
+ */
+function findEvidence(claim: string, chunks: { text: string; ref: string }[], opts: VerificationOptions): { evidence: Evidence[]; bestScore: number } {
+  const candidates: Evidence[] = []
+  for (const { text, ref } of chunks) {
+    const score = looksSupported(claim, text, opts.minTokenOverlap, opts.minNumberCoverage)[1]
+    if (score > 0.1) candidates.push({ ref, text, score })
+  }
+
+  const ranked = [...candidates].sort((left, right) => right.score - left.score)
+  // After a descending sort the first entry carries the maximum score.
+  const bestScore = ranked[0]?.score ?? 0
+  return { evidence: ranked.slice(0, 3), bestScore }
+}
+
+/**
+ * Aggregates per-claim results into summary counts.
+ *
+ * Only claims of kind 'fact' contribute to the supported/unsupported counts
+ * and to the confidence average; `totalClaims` counts every claim.
+ *
+ * @param claims Verified claims.
+ * @returns Counts plus the mean fact confidence rounded to two decimals
+ *   (1.0 when there are no fact claims).
+ */
+function calculateSummary(claims: Claim[]): VerificationSummary {
+  let factCount = 0
+  let supportedClaims = 0
+  let unsupportedClaims = 0
+  let confidenceSum = 0
+
+  for (const claim of claims) {
+    if (claim.kind !== 'fact') continue
+    factCount += 1
+    confidenceSum += claim.confidence
+    if (claim.verdict === 'supported') supportedClaims += 1
+    else if (claim.verdict === 'unsupported') unsupportedClaims += 1
+  }
+
+  const overallConfidence = factCount === 0 ? 1.0 : Math.round((confidenceSum / factCount) * 100) / 100
+  return { totalClaims: claims.length, supportedClaims, unsupportedClaims, overallConfidence }
+}
+
+/**
+ * Verifies a clinical note against its source transcript.
+ *
+ * The note is split into claim sentences, each claim is classified and scored
+ * against every transcript chunk, and the per-claim verdicts are rolled up
+ * into an overall status:
+ * - 'failed' when more than 30% of the decided (supported + unsupported) fact
+ *   claims are unsupported;
+ * - 'partial' when fewer than 80% of all fact claims are supported, or any
+ *   fact claim is unsupported;
+ * - 'verified' otherwise, including when the note yields no fact claims.
+ *
+ * @param noteText Note text to verify.
+ * @param transcript Source transcript text.
+ * @param options Optional thresholds and filters; defaults are
+ *   minTokenOverlap 0.25, minNumberCoverage 1.0, factsOnly false.
+ * @returns Status, summary counts, per-claim results and elapsed milliseconds.
+ */
+export async function verifyNote(noteText: string, transcript: string, options: VerificationOptions = {}): Promise<VerificationResult> {
+  const startTime = performance.now()
+  const { minTokenOverlap = 0.25, minNumberCoverage = 1.0, factsOnly = false } = options
+
+  const claimTexts = extractClaims(noteText)
+  const chunks = chunkTranscript(transcript)
+  const claims: Claim[] = []
+
+  for (const [i, text] of claimTexts.entries()) {
+    const kind = classifyClaim(text)
+    if (factsOnly && kind !== 'fact') continue
+
+    const { evidence, bestScore } = findEvidence(text, chunks, { minTokenOverlap, minNumberCoverage })
+    claims.push({
+      // The id keeps the sentence's position in the note, so skipped claims leave gaps.
+      id: `claim_${i + 1}`,
+      text,
+      kind,
+      verdict: determineVerdict(bestScore, kind),
+      confidence: Math.round(bestScore * 100) / 100,
+      evidence
+    })
+  }
+
+  const summary = calculateSummary(claims)
+  const factCount = claims.filter(c => c.kind === 'fact').length
+  const decidedFacts = summary.supportedClaims + summary.unsupportedClaims
+
+  let status: VerificationResult['status'] = 'verified'
+  if (decidedFacts > 0 && summary.unsupportedClaims / decidedFacts > 0.3) {
+    status = 'failed'
+  } else if (factCount > 0 && (summary.supportedClaims / factCount < 0.8 || summary.unsupportedClaims > 0)) {
+    // The support rate is taken over ALL fact claims: facts whose verdict is
+    // 'uncertain' must count against it, otherwise a note whose facts are all
+    // uncertain would be reported as 'verified'.
+    status = 'partial'
+  }
+
+  return { status, summary, claims, processingTimeMs: Math.round(performance.now() - startTime) }
+}
@@ -0,0 +1,39 @@
+// types for note verification
+
+/** Category assigned to a note sentence by `classifyClaim`; 'fact' is the fallback category. */
+export type ClaimKind = 'fact' | 'inference' | 'opinion' | 'instruction' | 'question'
+/** Outcome of checking a claim; `determineVerdict` returns 'uncertain' for every non-fact claim. */
+export type Verdict = 'supported' | 'uncertain' | 'unsupported'
+
+/** One sentence extracted from the note, with its classification and verification outcome. */
+export interface Claim {
+  /** Identifier of the form `claim_<n>`, n being the sentence's 1-based position among the extracted sentences. */
+  id: string
+  /** The claim sentence as extracted from the note (trimmed). */
+  text: string
+  /** Category assigned to the sentence. */
+  kind: ClaimKind
+  /** Verdict derived from the best evidence score and the claim kind. */
+  verdict: Verdict
+  /** Best evidence score for this claim, rounded to two decimals. */
+  confidence: number
+  /** Matching transcript chunks, highest score first (at most three). */
+  evidence: Evidence[]
+}
+
+/** A transcript chunk that scored above the evidence cut-off for a claim. */
+export interface Evidence {
+  /** Reference to the chunk, formatted as `line:<n>`. */
+  ref: string
+  /** Trimmed text of the transcript chunk. */
+  text: string
+  /** Score of this chunk for the claim, as returned by `looksSupported`. */
+  score: number
+}
+
+/** Value returned by `verifyNote`. */
+export interface VerificationResult {
+  /** Overall outcome derived from the fact-claim verdict counts. */
+  status: 'verified' | 'partial' | 'failed'
+  /** Aggregate counts and confidence over the checked claims. */
+  summary: VerificationSummary
+  /** Per-claim results, in note order. */
+  claims: Claim[]
+  /** Elapsed time of the `verifyNote` call, rounded to whole milliseconds. */
+  processingTimeMs: number
+}
+
+/** Aggregate figures computed from the list of claims. */
+export interface VerificationSummary {
+  /** Number of claims in the result, of any kind. */
+  totalClaims: number
+  /** Number of fact claims whose verdict is 'supported'. */
+  supportedClaims: number
+  /** Number of fact claims whose verdict is 'unsupported'. */
+  unsupportedClaims: number
+  /** Mean confidence over fact claims, rounded to two decimals; 1.0 when there are no fact claims. */
+  overallConfidence: number
+}
+
+/** Optional settings accepted by `verifyNote`. */
+export interface VerificationOptions {
+  /** Minimum token-overlap ratio passed to `looksSupported`; `verifyNote` defaults it to 0.25. */
+  minTokenOverlap?: number
+  /** Minimum number-coverage ratio passed to `looksSupported`; `verifyNote` defaults it to 1.0. */
+  minNumberCoverage?: number
+  /** When true, sentences not classified as 'fact' are left out of the result; defaults to false. */
+  factsOnly?: boolean
+  // NOTE(review): both thresholds only influence the boolean returned by
+  // `looksSupported`, which `findEvidence` discards — confirm whether they are
+  // meant to affect verdicts.
+}
@@ -0,0 +1,65 @@
+import type { ClaimKind, Verdict } from './types'
+
+// Words dropped by tokenize before overlap is computed: common English
+// function words plus the clinical boilerplate terms 'patient', 'reports'
+// and 'denies'.
+// NOTE(review): 'not' and 'denies' are in this list, so negation does not
+// influence the token overlap — confirm this is acceptable for clinical notes.
+const STOP_WORDS = new Set([
+  'a', 'an', 'the', 'and', 'or', 'but', 'if', 'then', 'of', 'to', 'in', 'on', 'for', 'with', 'by', 'as',
+  'is', 'are', 'was', 'were', 'be', 'been', 'it', 'this', 'that', 'at', 'from', 'not', 'can', 'do', 'does',
+  'we', 'you', 'they', 'i', 'he', 'she', 'has', 'have', 'had', 'will', 'patient', 'reports', 'denies'
+])
+
+/**
+ * Turns text into lowercase comparison tokens.
+ *
+ * Every run of characters other than word characters and `-` becomes a
+ * separator. Tokens shorter than two characters and stop words are removed.
+ *
+ * @param text Input text; a falsy value is treated as the empty string.
+ * @returns Remaining tokens in their original order (duplicates kept).
+ */
+export function tokenize(text: string): string[] {
+  const cleaned = (text || '').toLowerCase().replace(/[^\w\-]+/g, ' ').trim()
+  if (cleaned.length === 0) return []
+
+  const kept: string[] = []
+  for (const word of cleaned.split(/\s+/)) {
+    if (word.length < 2 || STOP_WORDS.has(word)) continue
+    kept.push(word)
+  }
+  return kept
+}
+
+/**
+ * Finds numeric literals in text.
+ *
+ * A match is a run of digits, optionally followed by `.` or `,` and more
+ * digits, that is neither preceded nor followed by a word character.
+ *
+ * @param text Input text; a falsy value is treated as the empty string.
+ * @returns Matched substrings exactly as written, in order of appearance.
+ */
+export function extractNumbers(text: string): string[] {
+  const matches = (text || '').match(/(?<![\w])\d+(?:[.,]\d+)?(?![\w])/g)
+  return matches === null ? [] : matches
+}
+
+/**
+ * Fraction of the claim's distinct tokens that also occur in the evidence.
+ *
+ * @param claim Claim text.
+ * @param evidence Evidence text.
+ * @returns A ratio from 0 to 1; 0 when either side has no tokens after tokenizing.
+ */
+export function calculateOverlap(claim: string, evidence: string): number {
+  const wanted = new Set(tokenize(claim))
+  const available = new Set(tokenize(evidence))
+  if (wanted.size === 0 || available.size === 0) return 0
+
+  const shared = [...wanted].filter(token => available.has(token)).length
+  return shared / wanted.size
+}
+
+/**
+ * Fraction of the claim's numbers that also appear in the evidence.
+ *
+ * Numbers are compared as strings after the first `,` is replaced with `.`.
+ *
+ * @param claim Claim text.
+ * @param evidence Evidence text.
+ * @returns 1.0 when the claim has no numbers, 0 when the claim has numbers
+ *   but the evidence has none, otherwise matched / total claim numbers.
+ */
+function numberCoverage(claim: string, evidence: string): number {
+  const normalize = (raw: string): string => raw.replace(',', '.')
+
+  const wanted = extractNumbers(claim).map(n => normalize(n))
+  if (wanted.length === 0) return 1.0
+
+  const seen = new Set(extractNumbers(evidence).map(n => normalize(n)))
+  if (seen.size === 0) return 0
+
+  const matched = wanted.filter(n => seen.has(n)).length
+  return matched / wanted.length
+}
+
+/**
+ * Judges whether a piece of evidence supports a claim.
+ *
+ * @param claim Claim text.
+ * @param evidence Evidence text.
+ * @param minOverlap Minimum token-overlap ratio for the boolean result.
+ * @param minNumCov Minimum number-coverage ratio for the boolean result.
+ * @returns `[supported, score]`. `supported` is true when both thresholds are
+ *   met. `score` is 0.7 × token overlap + 0.3 × number coverage, or 0 when
+ *   claim and evidence share no token at all.
+ */
+export function looksSupported(claim: string, evidence: string, minOverlap = 0.25, minNumCov = 1.0): [boolean, number] {
+  const overlap = calculateOverlap(claim, evidence)
+  const numCov = numberCoverage(claim, evidence)
+  const supported = overlap >= minOverlap && numCov >= minNumCov
+
+  // Number coverage is 1.0 whenever the claim contains no numbers. Without
+  // this guard every unrelated chunk would score 0.3 for such a claim, so the
+  // claim could never fall below the 'unsupported' cut-off.
+  const score = overlap > 0 ? overlap * 0.7 + numCov * 0.3 : 0
+  return [supported, score]
+}
+
+/**
+ * Assigns a category to a note sentence using simple phrase rules.
+ *
+ * Rules are tried in order on the lowercased, trimmed text: a trailing `?`
+ * gives 'question'; inference phrases anywhere give 'inference'; opinion
+ * phrases anywhere give 'opinion'; an instruction prefix gives 'instruction';
+ * anything else is a 'fact'. Phrase checks are plain substring matches, so
+ * 'unlikely' also matches 'likely'.
+ *
+ * @param text Sentence to classify.
+ * @returns The first matching category, or 'fact'.
+ */
+export function classifyClaim(text: string): ClaimKind {
+  const sentence = text.toLowerCase().trim()
+  const mentions = (...phrases: string[]): boolean => phrases.some(p => sentence.includes(p))
+  const opensWith = (...prefixes: string[]): boolean => prefixes.some(p => sentence.startsWith(p))
+
+  if (sentence.endsWith('?')) return 'question'
+  if (mentions('i think', 'i believe', 'probably', 'likely')) return 'inference'
+  if (mentions('in my opinion', 'i feel')) return 'opinion'
+  if (opensWith('do ', 'please ', 'recommend ', 'consider ')) return 'instruction'
+  return 'fact'
+}
+
+/**
+ * Maps an evidence score to a verdict.
+ *
+ * @param score Best evidence score for the claim.
+ * @param kind Category of the claim.
+ * @returns 'uncertain' for any non-fact claim. For facts: 'supported' at
+ *   0.5 or above, 'uncertain' at 0.25 or above, otherwise 'unsupported'.
+ */
+export function determineVerdict(score: number, kind: ClaimKind): Verdict {
+  if (kind === 'fact') {
+    if (score >= 0.5) return 'supported'
+    return score >= 0.25 ? 'uncertain' : 'unsupported'
+  }
+  return 'uncertain'
+}