关于计算两个CSV文件的克拉默系数
最近项目中遇到一个需求通过计算两个csv文件的克拉默系数来判断文件的相似度是否存在抄袭。简单说下我们业务背景我们的是竞赛平台参数选手通过分析大量数据生成一个csv文件第一列为uuid第二列为目标答案根据组织的比赛不同第二列的取值范围不同有的时候可能是0/1有的时候可能是1//2/3/4有的时候可能还是Y/N。代码如下package com.lsl.cramer; import java.util.Map; import java.util.Random; import java.util.Set; import java.util.UUID; /** * 封装选手提交的竞赛答案CSV数据适配2列固定结构 */ public class ContestAnswerData { // 选手名称/文件标识用于结果输出 private final String playerIdentifier; // uuid列名业务上的目标id private final String uuidColumnName; // 答案列名 private final String answerColumnName; // 核心数据keyuuid/目标idvalue选手答案值 private final MapString, String answerMap; // 该选手答案的所有唯一取值用于同竞赛格式校验 private final SetString answerValueSet; public ContestAnswerData(String playerIdentifier, String uuidColumnName, String answerColumnName, MapString, String answerMap, SetString answerValueSet) { this.playerIdentifier playerIdentifier; this.uuidColumnName uuidColumnName; this.answerColumnName answerColumnName; this.answerMap answerMap; this.answerValueSet answerValueSet; } // Getter方法 public String getPlayerIdentifier() { return playerIdentifier; } public String getUuidColumnName() { return uuidColumnName; } public String getAnswerColumnName() { return answerColumnName; } public MapString, String getAnswerMap() { return answerMap; } public SetString getAnswerValueSet() { return answerValueSet; } // 获取有效答案数量 public int getValidAnswerCount() { return answerMap.size(); } public static void main(String[] args) { Random random new Random(); for(int i 0; i100;i) { System.out.println(UUID.randomUUID().toString() , random.nextInt(5)); } } }package com.lsl.cramer; /** * 竞赛答案CSV格式校验异常适配业务场景错误处理 */ public class ContestAnswerCsvException extends RuntimeException { public ContestAnswerCsvException(String message) { super(message); } public ContestAnswerCsvException(String message, Throwable cause) { super(message, cause); } }package com.lsl.cramer; import com.opencsv.CSVReader; import com.opencsv.exceptions.CsvException; import java.io.FileReader; import java.io.IOException; import 
java.util.*;

/**
 * Computes Cramér's V between two contestants' answer files for plagiarism screening.
 *
 * <p>Works on two-column CSV files (identifier, answer) and adapts to whatever
 * answer values appear in the data.
 *
 * <p>Note: Cramér's V measures statistical association, not literal agreement.
 * Two answer columns that are a consistent relabeling of each other (for example
 * always opposite) also yield a value close to 1, so a high score should trigger
 * manual review rather than an automatic verdict.
 */
public class ContestAnswerCramerCalculator {

    // Expected header names. Header names are currently NOT enforced when reading,
    // so these two constants are informational only.
    private static final String UUID_COLUMN_NAME = "uuid";
    private static final String ANSWER_COLUMN_NAME = "answer";

    /** Cramér's V at or above this value is flagged as high plagiarism risk. */
    private static final double PLAGIARISM_RISK_THRESHOLD = 0.9;

    /** Cells whose expected frequency is below this value trigger a reliability warning. */
    private static final double MIN_EXPECTED_FREQUENCY = 1.0;

    /** Category used for an empty answer cell and, under MISSING_AS_NULL, for an absent uuid. */
    private static final String MISSING_ANSWER = "NULL";

    private static final String RISK_LEVEL_HIGH = "高风险抄袭";
    private static final String RISK_LEVEL_NORMAL = "正常";

    /** How to treat uuids that appear in only one of the two files. */
    public enum MissingValueStrategy {
        /** Drop uuids present in only one file (recommended: both files should cover the same ids). */
        EXCLUDE_MISSING,
        /** Keep every uuid and treat the absent side as its own "NULL" answer category. */
        MISSING_AS_NULL
    }

    /**
     * Reads a contestant's answer CSV into a {@link ContestAnswerData}.
     *
     * <p>The first row is always treated as the header; its two cells become the
     * column names of the result. Data rows with an empty uuid are skipped, empty
     * answers become the "NULL" category, and for duplicate uuids the last row wins.
     *
     * <p>NOTE(review): the file is decoded with the platform default charset via
     * {@code FileReader} — confirm this matches how submissions are encoded.
     *
     * @param filePath         path of the CSV file
     * @param playerIdentifier contestant label (e.g. "选手A")
     * @return the parsed answer data
     * @throws ContestAnswerCsvException if the file cannot be read, is empty, does not
     *                                   have exactly two columns, or has no data rows
     */
    public ContestAnswerData readAnswerCsv(String filePath, String playerIdentifier) {
        try (CSVReader reader = new CSVReader(new FileReader(filePath))) {
            List<String[]> allRows = reader.readAll();

            // 1. Basic shape checks.
            if (allRows.isEmpty()) {
                throw new ContestAnswerCsvException("CSV文件为空：" + filePath);
            }
            String[] header = allRows.get(0);
            if (header.length != 2) {
                throw new ContestAnswerCsvException(
                        "CSV文件列数必须为2，当前列数：" + header.length + "，文件：" + filePath);
            }

            // 2. Column names come from the header row. They are recorded but not
            //    compared against the expected names.
            String uuidColumn = header[0].trim();
            String answerColumn = header[1].trim();

            // 3. Data rows: de-duplicate by uuid, normalize empty values.
            Map<String, String> answerMap = new LinkedHashMap<>();
            for (int i = 1; i < allRows.size(); i++) {
                String[] row = allRows.get(i);
                if (isBlankRow(row)) {
                    continue;
                }
                if (row.length != 2) {
                    throw new ContestAnswerCsvException(
                            "CSV第" + (i + 1) + "行列数与表头不一致，文件：" + filePath);
                }
                String uuid = row[0].trim();
                String answer = row[1].trim();
                if (uuid.isEmpty()) {
                    continue;
                }
                if (answer.isEmpty()) {
                    answer = MISSING_ANSWER;
                }
                // put() overwrites, so the last row for a uuid is the one kept.
                answerMap.put(uuid, answer);
            }

            // 4. At least one usable row is required.
            if (answerMap.isEmpty()) {
                throw new ContestAnswerCsvException("CSV文件无有效答案数据，文件：" + filePath);
            }

            // 5. Distinct answer values, used later by the same-contest check.
            Set<String> answerValueSet = new HashSet<>(answerMap.values());
            return new ContestAnswerData(
                    playerIdentifier, uuidColumn, answerColumn, answerMap, answerValueSet);
        } catch (IOException | CsvException e) {
            throw new ContestAnswerCsvException("读取CSV答案文件失败：" + filePath, e);
        }
    }

    /**
     * Returns true for a row with no cells or a single empty cell, which is
     * presumably how the CSV reader reports a blank line (e.g. a trailing newline).
     */
    private static boolean isBlankRow(String[] row) {
        return row.length == 0 || (row.length == 1 && row[0].trim().isEmpty());
    }

    /**
     * Checks that two contestants' data plausibly belong to the same contest:
     * same column names (case-insensitive) and similar answer value sets.
     *
     * <p>The value-set check tolerates a small difference: the union of both value
     * sets may be at most 1.2 times the larger individual set. For very small sets
     * this effectively requires one set to contain the other.
     *
     * @param data1 first contestant's data
     * @param data2 second contestant's data
     * @throws ContestAnswerCsvException if the column names differ or the value sets diverge too much
     */
    public void validateSameContestFormat(ContestAnswerData data1, ContestAnswerData data2) {
        if (!data1.getUuidColumnName().equalsIgnoreCase(data2.getUuidColumnName())) {
            throw new ContestAnswerCsvException("两个选手的uuid列名不一致，不属于同一竞赛");
        }
        if (!data1.getAnswerColumnName().equalsIgnoreCase(data2.getAnswerColumnName())) {
            throw new ContestAnswerCsvException("两个选手的答案列名不一致，不属于同一竞赛");
        }

        Set<String> allAnswerValues = new HashSet<>(data1.getAnswerValueSet());
        allAnswerValues.addAll(data2.getAnswerValueSet());

        int largerSetSize = Math.max(
                data1.getAnswerValueSet().size(), data2.getAnswerValueSet().size());
        if (allAnswerValues.size() > largerSetSize * 1.2) {
            throw new ContestAnswerCsvException("两个选手的答案取值范围差异过大，不属于同一竞赛，无法对比");
        }
    }

    /**
     * Aligns two contestants' answers by uuid.
     *
     * <p>Whether a uuid is "missing" is decided by key presence, not by answer value,
     * so a uuid that exists in both files with an empty ("NULL") answer is kept
     * under {@link MissingValueStrategy#EXCLUDE_MISSING}. Iteration follows the order
     * of the first file, then uuids only found in the second, so results are deterministic.
     *
     * @param data1    first contestant's data
     * @param data2    second contestant's data
     * @param strategy how to treat uuids present in only one file
     * @return aligned data: key = uuid, value = [answer of contestant 1, answer of contestant 2]
     * @throws ContestAnswerCsvException if no uuid remains after alignment
     */
    public Map<String, List<String>> alignAnswerData(ContestAnswerData data1,
                                                     ContestAnswerData data2,
                                                     MissingValueStrategy strategy) {
        Map<String, String> answerMap1 = data1.getAnswerMap();
        Map<String, String> answerMap2 = data2.getAnswerMap();

        // Union of all uuids, in a stable order.
        Set<String> allUuids = new LinkedHashSet<>(answerMap1.keySet());
        allUuids.addAll(answerMap2.keySet());

        Map<String, List<String>> alignedData = new LinkedHashMap<>();
        for (String uuid : allUuids) {
            boolean presentInBoth = answerMap1.containsKey(uuid) && answerMap2.containsKey(uuid);
            if (strategy == MissingValueStrategy.EXCLUDE_MISSING && !presentInBoth) {
                continue;
            }
            String answer1 = answerMap1.getOrDefault(uuid, MISSING_ANSWER);
            String answer2 = answerMap2.getOrDefault(uuid, MISSING_ANSWER);
            alignedData.put(uuid, Arrays.asList(answer1, answer2));
        }

        if (alignedData.isEmpty()) {
            throw new ContestAnswerCsvException("数据对齐后无有效对比样本，两个选手无共同的uuid/目标id");
        }
        return alignedData;
    }

    /**
     * Builds the two-way contingency table of (contestant 1 answer, contestant 2 answer).
     *
     * <p>Row and column categories are discovered from the data in first-seen order.
     *
     * @param alignedData aligned answers as produced by {@link #alignAnswerData}
     * @return the table: frequency matrix, row/column categories and total sample count
     */
    public ContingencyTable buildContingencyTable(Map<String, List<String>> alignedData) {
        // Category -> index, assigned in first-seen order. The maps give constant-time
        // lookups when filling the matrix.
        Map<String, Integer> rowIndexByAnswer = new LinkedHashMap<>();
        Map<String, Integer> colIndexByAnswer = new LinkedHashMap<>();
        for (List<String> answerPair : alignedData.values()) {
            rowIndexByAnswer.putIfAbsent(answerPair.get(0), rowIndexByAnswer.size());
            colIndexByAnswer.putIfAbsent(answerPair.get(1), colIndexByAnswer.size());
        }

        int[][] frequencyMatrix = new int[rowIndexByAnswer.size()][colIndexByAnswer.size()];
        for (List<String> answerPair : alignedData.values()) {
            int rowIdx = rowIndexByAnswer.get(answerPair.get(0));
            int colIdx = colIndexByAnswer.get(answerPair.get(1));
            frequencyMatrix[rowIdx][colIdx]++;
        }

        List<String> player1Categories = new ArrayList<>(rowIndexByAnswer.keySet());
        List<String> player2Categories = new ArrayList<>(colIndexByAnswer.keySet());
        return new ContingencyTable(
                frequencyMatrix, player1Categories, player2Categories, alignedData.size());
    }

    /**
     * Computes Pearson's chi-square statistic for a contingency table:
     * the sum over all cells of (observed - expected)^2 / expected, where
     * expected = rowSum * colSum / total.
     *
     * <p>Prints a warning when any cell's expected frequency is below the
     * configured minimum.
     *
     * @param table contingency table
     * @return the chi-square value; 0 for an empty table
     */
    public double calculatePearsonChiSquare(ContingencyTable table) {
        int[][] frequencyMatrix = table.getFrequencyMatrix();
        int rowCount = frequencyMatrix.length;
        int totalSample = table.getTotalSample();
        if (rowCount == 0 || frequencyMatrix[0].length == 0 || totalSample <= 0) {
            return 0.0;
        }
        int colCount = frequencyMatrix[0].length;

        // Marginal totals.
        int[] rowSum = new int[rowCount];
        int[] colSum = new int[colCount];
        for (int i = 0; i < rowCount; i++) {
            for (int j = 0; j < colCount; j++) {
                rowSum[i] += frequencyMatrix[i][j];
                colSum[j] += frequencyMatrix[i][j];
            }
        }

        double chiSquare = 0.0;
        int lowFrequencyCount = 0;
        for (int i = 0; i < rowCount; i++) {
            for (int j = 0; j < colCount; j++) {
                // Multiply as double: rowSum * colSum can exceed the int range for large samples.
                double expectedFrequency = (double) rowSum[i] * colSum[j] / totalSample;
                if (expectedFrequency < MIN_EXPECTED_FREQUENCY) {
                    lowFrequencyCount++;
                }
                // An all-zero row or column contributes nothing; skipping it avoids 0/0 = NaN.
                if (expectedFrequency > 0.0) {
                    double deviation = frequencyMatrix[i][j] - expectedFrequency;
                    chiSquare += deviation * deviation / expectedFrequency;
                }
            }
        }

        if (lowFrequencyCount > 0) {
            double lowFrequencyRatio = (double) lowFrequencyCount / (rowCount * colCount);
            System.out.printf(
                    "⚠️ 警告：列联表中%d个单元格理论频数低于%.1f，占比%.1f%%，卡方检验结果可靠性可能受影响%n",
                    lowFrequencyCount, MIN_EXPECTED_FREQUENCY, lowFrequencyRatio * 100);
        }
        return chiSquare;
    }

    /**
     * Computes Cramér's V = sqrt(chiSquare / (n * min(rows - 1, cols - 1))).
     *
     * @param chiSquare Pearson chi-square value of the table
     * @param table     contingency table the chi-square value was computed from
     * @return Cramér's V in [0, 1]; 0 when either side has at most one category
     *         or the table has no samples (the coefficient is undefined there)
     */
    public double calculateCramerCoefficient(double chiSquare, ContingencyTable table) {
        int totalSample = table.getTotalSample();
        int categoryCount1 = table.getPlayer1Categories().size();
        int categoryCount2 = table.getPlayer2Categories().size();

        // With a single category the denominator would be zero.
        if (categoryCount1 <= 1 || categoryCount2 <= 1) {
            System.out.println("⚠️ 警告：答案仅1个有效类别，克拉默系数无统计意义，返回0");
            return 0.0;
        }
        if (totalSample <= 0) {
            return 0.0;
        }

        int minDegreeOfFreedom = Math.min(categoryCount1 - 1, categoryCount2 - 1);
        double coefficient = Math.sqrt(chiSquare / ((double) totalSample * minDegreeOfFreedom));
        // Floating-point rounding can push a perfect association marginally above 1.
        return Math.min(1.0, coefficient);
    }

    /**
     * Runs the full pipeline: read both CSV files, validate, align, build the
     * contingency table, compute chi-square and Cramér's V, classify the risk
     * and print a report to stdout.
     *
     * @param filePath1         CSV path of contestant 1
     * @param player1Identifier label of contestant 1
     * @param filePath2         CSV path of contestant 2
     * @param player2Identifier label of contestant 2
     * @param strategy          how to treat uuids present in only one file
     * @return the result: coefficient, risk level and the supporting statistics
     * @throws ContestAnswerCsvException if a file is unreadable/invalid or the two files cannot be compared
     */
    public CramerCalculationResult calculateFullProcess(String filePath1,
                                                        String player1Identifier,
                                                        String filePath2,
                                                        String player2Identifier,
                                                        MissingValueStrategy strategy) {
        // 1. Read both answer files.
        ContestAnswerData data1 = readAnswerCsv(filePath1, player1Identifier);
        ContestAnswerData data2 = readAnswerCsv(filePath2, player2Identifier);
        System.out.printf("✅ 成功读取两个选手的答案文件%n");
        System.out.printf("   %s：%d条有效答案，取值范围：%s%n",
                player1Identifier, data1.getValidAnswerCount(), data1.getAnswerValueSet());
        System.out.printf("   %s：%d条有效答案，取值范围：%s%n",
                player2Identifier, data2.getValidAnswerCount(), data2.getAnswerValueSet());

        // 2. Same-contest format check.
        validateSameContestFormat(data1, data2);
        System.out.println("✅ 两个选手的答案符合同一竞赛格式要求，可正常对比");

        // 3. Align by uuid.
        Map<String, List<String>> alignedData = alignAnswerData(data1, data2, strategy);
        System.out.printf("✅ 数据对齐完成，有效对比样本数：%d%n", alignedData.size());

        // 4. Contingency table.
        ContingencyTable table = buildContingencyTable(alignedData);
        System.out.printf("✅ 列联表构建完成：%s答案类别数%d，%s答案类别数%d%n",
                player1Identifier, table.getPlayer1Categories().size(),
                player2Identifier, table.getPlayer2Categories().size());

        // 5. Pearson chi-square.
        double chiSquare = calculatePearsonChiSquare(table);
        System.out.printf("✅ 皮尔逊卡方值计算完成：%.2f%n", chiSquare);

        // 6. Cramér's V.
        double cramerCoefficient = calculateCramerCoefficient(chiSquare, table);
        System.out.printf("✅ 克拉默系数计算完成：%.4f%n", cramerCoefficient);

        // 7. Risk classification; the threshold itself counts as high risk,
        //    matching the "≥" shown in the printed report.
        String riskLevel = cramerCoefficient >= PLAGIARISM_RISK_THRESHOLD
                ? RISK_LEVEL_HIGH
                : RISK_LEVEL_NORMAL;
        if (RISK_LEVEL_HIGH.equals(riskLevel)) {
            System.out.println("⚠️ 检测到高风险抄袭行为，请立即人工复核");
        }

        // 8. Package and print the result.
        CramerCalculationResult result = new CramerCalculationResult(
                data1,
                data2,
                alignedData.size(),
                table,
                chiSquare,
                cramerCoefficient,
                riskLevel,
                PLAGIARISM_RISK_THRESHOLD
        );
        printResult(result);
        return result;
    }

    /**
     * Prints the summary figures followed by the cross-tabulation with row and
     * column totals.
     *
     * @param result the calculation result to print
     */
    private void printResult(CramerCalculationResult result) {
        final String separator = "------------------------------------------------------------";

        System.out.println("\n===== 竞赛答案相似度对比结果 =====");
        System.out.printf("对比选手1：%s%n", result.getData1().getPlayerIdentifier());
        System.out.printf("对比选手2：%s%n", result.getData2().getPlayerIdentifier());
        System.out.printf("有效对比样本数：%d%n", result.getTotalSample());
        System.out.printf("皮尔逊卡方值：%.2f%n", result.getChiSquare());
        System.out.printf("克拉默系数（相似度）：%.4f%n", result.getCramerCoefficient());
        System.out.printf("抄袭风险阈值：≥%.2f%n", result.getRiskThreshold());
        System.out.printf("最终风险评估：%s%n", result.getRiskLevel());

        System.out.println("\n答案交叉统计详情：");
        System.out.println(separator);

        ContingencyTable table = result.getTable();
        List<String> player1Categories = table.getPlayer1Categories();
        List<String> player2Categories = table.getPlayer2Categories();
        int[][] frequencyMatrix = table.getFrequencyMatrix();

        // Header row: "player1\player2" corner cell, then one column per category.
        System.out.printf("%-15s",
                result.getData1().getPlayerIdentifier() + "\\" + result.getData2().getPlayerIdentifier());
        for (String category : player2Categories) {
            System.out.printf("%-10s", category);
        }
        System.out.println("行合计");
        System.out.println(separator);

        // One line per row category, ending with the row total.
        for (int i = 0; i < player1Categories.size(); i++) {
            System.out.printf("%-15s", player1Categories.get(i));
            int rowSum = 0;
            for (int j = 0; j < player2Categories.size(); j++) {
                System.out.printf("%-10d", frequencyMatrix[i][j]);
                rowSum += frequencyMatrix[i][j];
            }
            System.out.println(rowSum);
        }

        // Column totals, ending with the grand total.
        System.out.println(separator);
        System.out.printf("%-15s", "列合计");
        for (int j = 0; j < player2Categories.size(); j++) {
            int colSum = 0;
            for (int i = 0; i < player1Categories.size(); i++) {
                colSum += frequencyMatrix[i][j];
            }
            System.out.printf("%-10d", colSum);
        }
        System.out.println(result.getTotalSample());
        System.out.println();
    }

    /**
     * Contingency table: observed frequencies plus the row/column category labels.
     * Rows correspond to contestant 1's answers, columns to contestant 2's.
     */
    public static class ContingencyTable {
        private final int[][] frequencyMatrix;
        private final List<String> player1Categories;
        private final List<String> player2Categories;
        private final int totalSample;

        /**
         * @param frequencyMatrix   observed counts, indexed [row][column]
         * @param player1Categories row labels, in matrix row order
         * @param player2Categories column labels, in matrix column order
         * @param totalSample       total number of compared samples
         */
        public ContingencyTable(int[][] frequencyMatrix,
                                List<String> player1Categories,
                                List<String> player2Categories,
                                int totalSample) {
            this.frequencyMatrix = frequencyMatrix;
            this.player1Categories = player1Categories;
            this.player2Categories = player2Categories;
            this.totalSample = totalSample;
        }

        public int[][] getFrequencyMatrix() {
            return frequencyMatrix;
        }

        public List<String> getPlayer1Categories() {
            return player1Categories;
        }

        public List<String> getPlayer2Categories() {
            return player2Categories;
        }

        public int getTotalSample() {
            return totalSample;
        }
    }

    /**
     * Final result of a comparison, for use by upstream business code.
     */
    public static class CramerCalculationResult {
        private final ContestAnswerData data1;
        private final ContestAnswerData data2;
        private final int totalSample;
        private final ContingencyTable table;
        private final double chiSquare;
        private final double cramerCoefficient;
        private final String riskLevel;
        private final double riskThreshold;

        /**
         * @param data1             first contestant's data
         * @param data2             second contestant's data
         * @param totalSample       number of aligned samples compared
         * @param table             contingency table of the two answer columns
         * @param chiSquare         Pearson chi-square value
         * @param cramerCoefficient Cramér's V
         * @param riskLevel         risk label ("高风险抄袭" or "正常")
         * @param riskThreshold     threshold used for the classification
         */
        public CramerCalculationResult(ContestAnswerData data1,
                                       ContestAnswerData data2,
                                       int totalSample,
                                       ContingencyTable table,
                                       double chiSquare,
                                       double cramerCoefficient,
                                       String riskLevel,
                                       double riskThreshold) {
            this.data1 = data1;
            this.data2 = data2;
            this.totalSample = totalSample;
            this.table = table;
            this.chiSquare = chiSquare;
            this.cramerCoefficient = cramerCoefficient;
            this.riskLevel = riskLevel;
            this.riskThreshold = riskThreshold;
        }

        public ContestAnswerData getData1() {
            return data1;
        }

        public ContestAnswerData getData2() {
            return data2;
        }

        public int getTotalSample() {
            return totalSample;
        }

        public ContingencyTable getTable() {
            return table;
        }

        public double getChiSquare() {
            return chiSquare;
        }

        public double getCramerCoefficient() {
            return cramerCoefficient;
        }

        public String getRiskLevel() {
            return riskLevel;
        }

        public double getRiskThreshold() {
            return riskThreshold;
        }
    }
}

package com.lsl.cramer;

/**
 * Demo entry point showing how to compare two contestants' answer files
 * with {@link ContestAnswerCramerCalculator}.
 */
public class ContestAnswerCramerMain {

    public static void main(String[] args) {
        // CSV path and label of contestant 1.
        String filePath1 = "D:\\temp\\aaa.csv";
        String player1Identifier = "北京分行-选手A";

        // CSV path and label of contestant 2.
        String filePath2 = "D:\\temp\\bbb.csv";
        String player2Identifier = "北京分行-选手B";

        ContestAnswerCramerCalculator calculator = new ContestAnswerCramerCalculator();

        // Full pipeline with the recommended strategy: drop uuids present in only one file.
        ContestAnswerCramerCalculator.CramerCalculationResult result =
                calculator.calculateFullProcess(
                        filePath1,
                        player1Identifier,
                        filePath2,
                        player2Identifier,
                        ContestAnswerCramerCalculator.MissingValueStrategy.EXCLUDE_MISSING
                );

        // Upstream business code can act on the result (persist, alert, notify reviewers, ...).
        if ("高风险抄袭".equals(result.getRiskLevel())) {
            System.out.println("\n⚠️ 业务告警：检测到高风险抄袭行为，已触发人工复核流程");
        }
    }
}