/*
 * Building a front-end observability system: hands-on, full-pipeline
 * monitoring from performance-metric collection to closed-loop alerting.
 *
 * 1. A white screen in production went unnoticed for 10 minutes: the cost of
 *    missing front-end monitoring
 *
 * A user reports a white screen -> customer service files a ticket -> ops
 * investigates -> back-end logs look normal -> only then are the front-end
 * engineers pulled into the chat. By the time the front end gets involved,
 * 10 minutes have passed since the problem started. Worse, nobody can say how
 * many users were affected, on which pages, or on which devices.
 *
 * Typical gaps in front-end monitoring:
 *   - No performance baseline: nobody knows what normal FCP/LCP/INP values
 *     are, so there is no starting point for optimisation.
 *   - No error tracking: JS errors are only visible in the console, nothing is
 *     aggregated, and the same error keeps recurring unnoticed.
 *   - No view of user impact: unknown how many users, which pages and which
 *     browsers an error affects.
 *   - No alerting loop: errors happen and nobody knows; when somebody knows
 *     there is no alert; when there is an alert there is no follow-up.
 *
 * The three pillars of front-end observability: Metrics (trends), Logs
 * (details) and Traces (end-to-end). Only when the three are linked can you
 * get from "something broke" to "root cause located".
 *
 * 2. Architecture and data flow
 *
 *   graph TB
 *     subgraph Collection layer (browser)
 *       A[Web Vitals collector]
 *       B[JS error capture]
 *       C[API request monitoring]
 *       D[User behaviour tracking]
 *     end
 *     subgraph Transport layer
 *       E[Batched reporting + compression]
 *       F[Beacon API / fetch]
 *       G[Sampling strategy]
 *     end
 *     subgraph Storage and compute layer
 *       H[ClickHouse: metrics storage]
 *       I[Elasticsearch: log storage]
 *       J[Jaeger: trace storage]
 *     end
 *     subgraph Alerting and visualisation layer
 *       K[Grafana: dashboards]
 *       L[Alert rule engine]
 *       M[PagerDuty/Feishu: notifications]
 *     end
 *     A & B & C & D --> E
 *     E --> F --> G
 *     G --> H & I & J
 *     H & I --> K
 *     K --> L --> M
 *
 * Core metric set (Core Web Vitals + business metrics):
 *
 *   | Metric         | Meaning                       | Collected via           | Alert threshold |
 *   |----------------|-------------------------------|-------------------------|-----------------|
 *   | LCP            | Largest contentful paint time | PerformanceObserver     | > 2.5s          |
 *   | INP            | Interaction latency           | PerformanceObserver     | > 200ms         |
 *   | CLS            | Cumulative layout shift       | PerformanceObserver     | > 0.1           |
 *   | FCP            | First contentful paint        | PerformanceObserver     | > 1.8s          |
 *   | API error rate | Share of failed API requests  | fetch/XHR interception  | > 1%            |
 *   | JS error rate  | Share of page JS exceptions   | window.onerror          | > 0.5%          |
 *
 * 3. A production-grade front-end monitoring implementation
 */

// ---------------------------------------------------------------------------
// Shared helpers
// ---------------------------------------------------------------------------

/**
 * Sends a serialized batch to the reporting endpoint on a best-effort basis.
 *
 * `navigator.sendBeacon` is tried first so the request can outlive the page.
 * When it is unavailable, or returns `false` (the browser declined to queue
 * the data), the payload is sent with a `keepalive` fetch instead.
 *
 * @param endpoint  URL that receives the JSON payload.
 * @param payload   Already-serialized JSON body.
 * @param fetchImpl fetch implementation for the fallback path. Callers that
 *                  patch `window.fetch` must pass the unpatched function so
 *                  the report itself is not intercepted.
 */
function sendPayload(endpoint: string, payload: string, fetchImpl: typeof fetch = fetch): void {
  if (typeof navigator !== 'undefined' && typeof navigator.sendBeacon === 'function') {
    if (navigator.sendBeacon(endpoint, payload)) return;
  }
  void fetchImpl(endpoint, { method: 'POST', body: payload, keepalive: true }).catch(() => {
    // Reporting failures are swallowed on purpose: monitoring must never
    // degrade the user experience.
  });
}

/**
 * Maps a metric value onto the good / needs-improvement / poor scale.
 *
 * @param value   Measured value.
 * @param goodMax Largest value still rated `good` (inclusive).
 * @param okMax   Largest value still rated `needs-improvement` (inclusive).
 */
function rateByThresholds(value: number, goodMax: number, okMax: number): MetricReport['rating'] {
  if (value <= goodMax) return 'good';
  return value <= okMax ? 'needs-improvement' : 'poor';
}

/**
 * Accumulates layout-shift scores into session windows and tracks the worst
 * window seen so far, which is the value reported as CLS.
 *
 * A new window starts when the shift is more than 1 second after the previous
 * shift, or more than 5 seconds after the first shift of the current window.
 */
class LayoutShiftSessionWindow {
  private windowValue = 0;
  private firstStart = 0;
  private lastStart = 0;
  private hasEntries = false;
  private worstValue = 0;

  /**
   * Records one layout shift.
   *
   * @param startTime `startTime` of the layout-shift entry, in milliseconds.
   * @param value     Layout-shift score of the entry.
   * @returns The largest session-window total observed so far.
   */
  add(startTime: number, value: number): number {
    const windowExpired =
      this.hasEntries &&
      (startTime - this.lastStart > 1000 || startTime - this.firstStart > 5000);
    if (!this.hasEntries || windowExpired) {
      this.windowValue = 0;
      this.firstStart = startTime;
      this.hasEntries = true;
    }
    this.windowValue += value;
    this.lastStart = startTime;
    this.worstValue = Math.max(this.worstValue, this.windowValue);
    return this.worstValue;
  }
}

// ---------------------------------------------------------------------------
// 3.1 Web Vitals collector
// ---------------------------------------------------------------------------

/** One performance-metric sample as sent to the reporting endpoint. */
interface MetricReport {
  /** Metric name: LCP / INP / CLS / FCP. */
  name: string;
  /** Metric value: milliseconds, or a unitless score for CLS. */
  value: number;
  rating: 'good' | 'needs-improvement' | 'poor';
  /** Change relative to the previously reported value of the same metric. */
  delta: number;
  /** Navigation type taken from the navigation timing entry, or `unknown`. */
  navigationType: string;
  /** Page URL at the time of the report. */
  url: string;
  /** Collection timestamp (`Date.now()`). */
  timestamp: number;
  /** Session ID used to correlate reports coming from the same session. */
  sessionId: string;
}

/**
 * Web Vitals collector: gathers LCP, INP, CLS and FCP through
 * PerformanceObserver and reports them in batches.
 *
 * Reports are queued and sent when the queue reaches `MAX_QUEUE_SIZE`, when
 * `FLUSH_INTERVAL` elapses, or when the page becomes hidden.
 */
class WebVitalsCollector {
  private reportQueue: MetricReport[] = [];
  private flushTimer: ReturnType<typeof setTimeout> | null = null;
  private readonly FLUSH_INTERVAL = 5000; // send a batch every 5 seconds
  private readonly MAX_QUEUE_SIZE = 20; // flush immediately at this length

  /**
   * @param reportEndpoint URL that receives the batched metric reports.
   * @param sessionId      Session ID stamped on every report.
   */
  constructor(private reportEndpoint: string, private sessionId: string) {
    this.initObservers();
    // Registered after the per-metric listeners so that INP and CLS are
    // enqueued before the queue is flushed on the same `hidden` transition.
    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState === 'hidden') this.flush();
    });
  }

  private initObservers(): void {
    this.observeLCP(); // largest contentful paint: loading performance
    this.observeINP(); // interaction latency: responsiveness
    this.observeCLS(); // cumulative layout shift: visual stability
    this.observeFCP(); // first contentful paint
  }

  /**
   * Subscribes to one performance entry type, including buffered entries.
   * A missing PerformanceObserver or a failing `observe()` call is treated
   * as "metric not supported" instead of breaking construction.
   */
  private observe(type: string, onEntries: (entries: PerformanceEntry[]) => void): void {
    if (typeof PerformanceObserver === 'undefined') return;
    try {
      const observer = new PerformanceObserver((list) => onEntries(list.getEntries()));
      observer.observe({ type, buffered: true });
    } catch {
      // Unsupported in this browser: skip the metric.
    }
  }

  /** Builds a report for `name` and queues it; `delta` is `value - previous`. */
  private emit(name: string, value: number, previous: number, rating: MetricReport['rating']): void {
    this.enqueueReport({
      name,
      value,
      rating,
      delta: value - previous,
      navigationType: this.getNavigationType(),
      url: location.href,
      timestamp: Date.now(),
      sessionId: this.sessionId,
    });
  }

  private observeLCP(): void {
    let reported = 0;
    this.observe('largest-contentful-paint', (entries) => {
      // The last entry of the batch is the current LCP candidate.
      const candidate = entries[entries.length - 1];
      if (!candidate) return;
      const value = candidate.startTime;
      this.emit('LCP', value, reported, this.rateLCP(value));
      reported = value;
    });
  }

  private observeINP(): void {
    // Reported value is the worst interaction duration observed so far.
    let worst = 0;
    let reported = 0;
    this.observe('event', (entries) => {
      for (const entry of entries) {
        if (!isInteractionEntry(entry)) continue;
        if (entry.duration > worst) worst = entry.duration;
      }
    });
    // Report when the page is hidden, and only if the value has grown since
    // the previous report, so repeated tab switches do not produce duplicates.
    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState !== 'hidden' || worst <= reported) return;
      this.emit('INP', worst, reported, this.rateINP(worst));
      reported = worst;
    });
  }

  private observeCLS(): void {
    const sessionWindow = new LayoutShiftSessionWindow();
    let cls = 0;
    let reported = 0;
    this.observe('layout-shift', (entries) => {
      for (const entry of entries) {
        if (!isLayoutShiftEntry(entry)) continue;
        // Shifts that follow recent user input are excluded.
        if (entry.hadRecentInput) continue;
        cls = sessionWindow.add(entry.startTime, entry.value);
      }
    });
    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState !== 'hidden' || cls <= reported) return;
      this.emit('CLS', cls, reported, this.rateCLS(cls));
      reported = cls;
    });
  }

  private observeFCP(): void {
    this.observe('paint', (entries) => {
      for (const entry of entries) {
        if (entry.name !== 'first-contentful-paint') continue;
        this.emit('FCP', entry.startTime, 0, this.rateFCP(entry.startTime));
      }
    });
  }

  // Rating functions: good / needs-improvement / poor thresholds per metric.
  private rateLCP(v: number): MetricReport['rating'] {
    return rateByThresholds(v, 2500, 4000);
  }

  private rateINP(v: number): MetricReport['rating'] {
    return rateByThresholds(v, 200, 500);
  }

  private rateCLS(v: number): MetricReport['rating'] {
    return rateByThresholds(v, 0.1, 0.25);
  }

  private rateFCP(v: number): MetricReport['rating'] {
    return rateByThresholds(v, 1800, 3000);
  }

  /** Queues a report; flushes at once when full, otherwise arms the timer. */
  private enqueueReport(report: MetricReport): void {
    this.reportQueue.push(report);
    if (this.reportQueue.length >= this.MAX_QUEUE_SIZE) {
      this.flush();
    } else if (!this.flushTimer) {
      this.flushTimer = setTimeout(() => this.flush(), this.FLUSH_INTERVAL);
    }
  }

  /** Sends everything currently queued and cancels any pending timer. */
  private flush(): void {
    if (this.flushTimer) {
      clearTimeout(this.flushTimer);
      this.flushTimer = null;
    }
    if (this.reportQueue.length === 0) return;

    const batch = this.reportQueue;
    this.reportQueue = [];
    sendPayload(this.reportEndpoint, JSON.stringify(batch));
  }

  /** Returns the `type` of the first navigation timing entry, or `unknown`. */
  private getNavigationType(): string {
    const entries = performance.getEntriesByType('navigation');
    if (entries.length > 0) {
      return (entries[0] as PerformanceNavigationTiming).type;
    }
    return 'unknown';
  }
}

// Entry guards

/**
 * True for Event Timing entries that belong to a user interaction.
 * Entries whose `interactionId` is 0 are not part of an interaction and are
 * rejected; entries without the field (engines that do not expose it) are
 * accepted.
 */
function isInteractionEntry(entry: PerformanceEntry): boolean {
  if (entry.entryType !== 'event') return false;
  const { interactionId } = entry as PerformanceEntry & { interactionId?: number };
  return interactionId === undefined || interactionId > 0;
}

/** Narrows a performance entry to a layout-shift entry. */
function isLayoutShiftEntry(entry: PerformanceEntry): entry is LayoutShift {
  return entry.entryType === 'layout-shift';
}

/** Fields of a layout-shift entry that the collector reads. */
interface LayoutShift extends PerformanceEntry {
  value: number;
  hadRecentInput: boolean;
}

// ---------------------------------------------------------------------------
// 3.2 JS error and API request monitoring
// ---------------------------------------------------------------------------

/** One captured error as sent to the reporting endpoint. */
interface ErrorReport {
  type: 'js_error' | 'promise_rejection' | 'resource_error';
  message: string;
  stack?: string;
  filename?: string;
  lineno?: number;
  colno?: number;
  url: string;
  timestamp: number;
  sessionId: string;
  userAgent: string;
}

/** Upper bound on remembered dedup keys before expired ones are pruned. */
const MAX_TRACKED_ERROR_KEYS = 500;

/**
 * Error monitor: captures JS runtime errors, unhandled promise rejections and
 * resource load failures, de-duplicates them and reports them in batches.
 */
class ErrorMonitor {
  private reportQueue: ErrorReport[] = [];
  // De-duplication: the same error is reported at most once per 10 seconds.
  private recentErrors = new Map<string, number>();
  private readonly DEDUP_WINDOW = 10000;
  private readonly MAX_QUEUE_SIZE = 10;
  private readonly FLUSH_INTERVAL = 5000;
  private flushTimer: ReturnType<typeof setTimeout> | null = null;

  /**
   * @param reportEndpoint URL that receives the batched error reports.
   * @param sessionId      Session ID stamped on every report.
   */
  constructor(
    private reportEndpoint: string,
    private sessionId: string,
  ) {
    this.initCapture();
    // Send whatever is queued before the page may be discarded.
    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState === 'hidden') this.flush();
    });
  }

  private initCapture(): void {
    // A single capture-phase listener handles both kinds of `error` event.
    // An element target means a resource (img/script/link) failed to load;
    // anything else is treated as a JS runtime error.
    window.addEventListener(
      'error',
      (event) => {
        const target = event.target;
        if (target instanceof Element) {
          this.captureError({
            type: 'resource_error',
            message: `资源加载失败: ${target.tagName} ${target.getAttribute('src') || target.getAttribute('href')}`,
          });
          return;
        }
        this.captureError({
          type: 'js_error',
          message: event.message,
          stack: event.error?.stack,
          filename: event.filename,
          lineno: event.lineno,
          colno: event.colno,
        });
      },
      true,
    );

    // Unhandled promise rejections.
    window.addEventListener('unhandledrejection', (event) => {
      const reason: unknown = event.reason;
      this.captureError({
        type: 'promise_rejection',
        message: reason instanceof Error ? reason.message : String(reason),
        stack: reason instanceof Error ? reason.stack : undefined,
      });
    });
  }

  /** Completes a partial report with page context and queues it. */
  private captureError(partial: Omit<ErrorReport, 'url' | 'timestamp' | 'sessionId' | 'userAgent'>): void {
    const now = Date.now();
    if (this.isDuplicate(`${partial.type}:${partial.message}`, now)) return;

    const report: ErrorReport = {
      ...partial,
      url: location.href,
      timestamp: now,
      sessionId: this.sessionId,
      userAgent: navigator.userAgent,
    };
    this.reportQueue.push(report);
    this.flushIfNeeded();
  }

  /**
   * Returns true when `dedupKey` was already accepted within `DEDUP_WINDOW`
   * milliseconds of `now`. Otherwise records `now` for the key and returns
   * false. Expired keys are pruned once the map grows large.
   */
  private isDuplicate(dedupKey: string, now: number): boolean {
    const lastTime = this.recentErrors.get(dedupKey);
    if (lastTime !== undefined && now - lastTime < this.DEDUP_WINDOW) {
      return true;
    }
    if (this.recentErrors.size >= MAX_TRACKED_ERROR_KEYS) {
      for (const [key, seenAt] of this.recentErrors) {
        if (now - seenAt >= this.DEDUP_WINDOW) this.recentErrors.delete(key);
      }
    }
    this.recentErrors.set(dedupKey, now);
    return false;
  }

  /** Flushes at once when the queue is full, otherwise arms the timer. */
  private flushIfNeeded(): void {
    if (this.reportQueue.length >= this.MAX_QUEUE_SIZE) {
      this.flush();
    } else if (!this.flushTimer) {
      this.flushTimer = setTimeout(() => this.flush(), this.FLUSH_INTERVAL);
    }
  }

  /** Sends everything currently queued and cancels any pending timer. */
  private flush(): void {
    if (this.flushTimer) {
      clearTimeout(this.flushTimer);
      this.flushTimer = null;
    }
    if (this.reportQueue.length === 0) return;

    const batch = this.reportQueue;
    this.reportQueue = [];
    sendPayload(this.reportEndpoint, JSON.stringify(batch));
  }
}

// ---------------------------------------------------------------------------
// 3.3 API request monitoring and end-to-end TraceId propagation
// ---------------------------------------------------------------------------

/** One observed fetch call as sent to the reporting endpoint. */
interface APIReport {
  url: string;
  method: string;
  /** HTTP status, or 0 when the request failed before a response arrived. */
  status: number;
  /** Elapsed time in milliseconds, measured with `performance.now()`. */
  duration: number;
  /** Trace ID returned by the back end in the `x-trace-id` response header. */
  traceId?: string;
  error?: string;
  timestamp: number;
  sessionId: string;
}

/**
 * Extracts the URL and HTTP method of a fetch call from its arguments.
 * `init.method` wins; for a `Request` input without it the request's own
 * method is used; otherwise the method defaults to `GET`.
 */
function describeRequest(input: RequestInfo | URL, init?: RequestInit): { url: string; method: string } {
  if (typeof input === 'string') {
    return { url: input, method: init?.method || 'GET' };
  }
  if (input instanceof URL) {
    return { url: input.href, method: init?.method || 'GET' };
  }
  return { url: input.url, method: init?.method || input.method || 'GET' };
}

/**
 * API request monitor: wraps `window.fetch` to record latency, status code
 * and TraceId for every call. Responses and errors are passed through to the
 * caller unchanged.
 */
class APIMonitor {
  private originalFetch: typeof window.fetch;

  /**
   * @param reportEndpoint URL that receives the API reports.
   * @param sessionId      Session ID stamped on every report.
   */
  constructor(
    private reportEndpoint: string,
    private sessionId: string,
  ) {
    this.originalFetch = window.fetch.bind(window);
    this.interceptFetch();
  }

  private interceptFetch(): void {
    window.fetch = async (input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
      const startTime = performance.now();
      const { url, method } = describeRequest(input, init);

      try {
        const response = await this.originalFetch(input, init);
        const duration = performance.now() - startTime;
        // TraceId returned by the back end, used to join front-end and
        // back-end traces.
        const traceId = response.headers.get('x-trace-id') || undefined;
        this.reportAPI({
          url,
          method,
          status: response.status,
          duration,
          traceId,
          timestamp: Date.now(),
          sessionId: this.sessionId,
        });
        return response;
      } catch (error) {
        const duration = performance.now() - startTime;
        this.reportAPI({
          url,
          method,
          status: 0,
          duration,
          error: error instanceof Error ? error.message : String(error),
          timestamp: Date.now(),
          sessionId: this.sessionId,
        });
        throw error;
      }
    };
  }

  /**
   * Sends one API report. The fallback path uses the unpatched fetch so the
   * report is not intercepted and reported again.
   */
  private reportAPI(report: APIReport): void {
    sendPayload(this.reportEndpoint, JSON.stringify([report]), this.originalFetch);
  }
}

/*
 * 4. Architectural trade-offs of front-end observability
 *
 * Collection volume vs. reporting cost:
 *   - Full collection gives complete data, but the reporting volume is large,
 *     server cost is high and page performance may suffer. Sampled reporting
 *     is cheap but can miss critical errors. Recommendation: report errors in
 *     full and sample performance metrics (sampling rate 10-20%).
 *   - Batched vs. real-time reporting: batching reduces the number of requests
 *     but adds latency. A 5-second batch is a reasonable compromise.
 *
 * PerformanceObserver compatibility:
 *   - The INP metric requires Chrome 96+; Safari and Firefox support is
 *     incomplete.
 *   - The `buffered: true` option is unavailable in some older browsers and
 *     needs a fallback to `performance.getEntriesByName`.
 *   - The Beacon API has a payload size limit (64KB) in some browsers; large
 *     batches need to be split.
 *
 * Limits of TraceId propagation:
 *   - Joining front-end and back-end traces depends on the back end returning
 *     `x-trace-id` in the response headers, which requires back-end changes.
 *   - Third-party API requests carry no TraceId, so the trace is broken there.
 *   - User behaviour tracking (click -> request -> render) needs additional
 *     span-correlation logic.
 *
 * When not to use this:
 *   - Internal admin systems: few users and simple pages, so the return on
 *     investment is low.
 *   - Pages with very high security requirements: the monitoring SDK can
 *     become an attack surface and needs strict auditing.
 *   - Restricted environments such as WeChat mini programs:
 *     PerformanceObserver and the Beacon API are unavailable and
 *     platform-specific APIs are needed.
 *
 * 5. Summary
 *
 * The three pillars of this front-end observability system: the Web Vitals
 * collector monitors performance metrics (LCP/INP/CLS/FCP); ErrorMonitor
 * captures JS errors and resource load failures; APIMonitor intercepts fetch
 * requests and records latency and status codes. All three share the batched
 * reporting and Beacon API transport so that data is not lost when the page
 * unloads. For collection strategy, reporting errors in full and sampling
 * performance metrics is a reasonable balance of cost and completeness.
 * Front-end/back-end trace correlation is achieved by passing a TraceId, but
 * it depends on back-end cooperation. Front-end observability suits products
 * with many users and complex pages; internal admin systems and restricted
 * environments can simplify as appropriate.
 */