80 lines
2.4 KiB
Java
Raw Normal View History

2025-03-17 10:46:29 +08:00
package com.diagnose.util;
import java.util.*;
/**
 * Substring search over a fixed list of strings, accelerated by an inverted
 * n-gram index.
 *
 * <p>At construction time every n-gram (of length {@code GRAM_SIZE}) of every
 * string is mapped to the positions of the strings that contain it. A query is
 * answered by intersecting the position sets of the query's n-grams and then
 * confirming each surviving candidate with {@link String#contains}.
 *
 * <p>The matcher works on a snapshot of the list passed to the constructor;
 * later changes to the caller's list are not reflected. Matching is
 * case-sensitive. {@code null} elements in the data are ignored.
 */
public class StringMatcher {
    /** Length of the character n-grams used as index keys. */
    private static final int GRAM_SIZE = 1;

    /** Maps each n-gram to the positions (in {@code dataList}) of the strings containing it. */
    private final Map<String, Set<Integer>> indexMap = new HashMap<>();

    /** Private snapshot of the searchable strings; index positions refer to this list. */
    private final List<String> dataList;

    /**
     * Builds a matcher (and its n-gram index) over a snapshot of the given strings.
     *
     * @param dataList strings to search; copied, so later changes to the
     *                 argument do not affect this matcher
     * @throws NullPointerException if {@code dataList} is {@code null}
     */
    public StringMatcher(List<String> dataList) {
        Objects.requireNonNull(dataList, "dataList");
        // Copy so the index cannot go stale if the caller mutates its list.
        this.dataList = new ArrayList<>(dataList);
        buildIndex();
    }

    /** Populates {@code indexMap} with every n-gram of every non-null string. */
    private void buildIndex() {
        // Only n-grams are indexed (not every substring) to balance speed and memory.
        for (int i = 0; i < dataList.size(); i++) {
            String s = dataList.get(i);
            if (s == null) {
                continue; // nothing to index; null entries can never match
            }
            for (int j = 0; j + GRAM_SIZE <= s.length(); j++) {
                String gram = s.substring(j, j + GRAM_SIZE);
                indexMap.computeIfAbsent(gram, k -> new HashSet<>()).add(i);
            }
        }
    }

    /**
     * Returns up to {@code limit} strings that contain {@code query} as a
     * substring, in the same order as they appear in the data.
     *
     * @param query substring to look for; an empty query matches every
     *              non-null string
     * @param limit maximum number of results
     * @return a new, mutable list of matches; empty if {@code query} is
     *         {@code null}, {@code limit <= 0}, or nothing matches
     */
    public List<String> search(String query, int limit) {
        if (query == null || limit <= 0) {
            return new ArrayList<>();
        }
        // A query shorter than one n-gram cannot use the index: fall back to a scan.
        if (query.length() < GRAM_SIZE) {
            return simpleSearch(query, limit);
        }

        // Intersect the position sets of all query n-grams. A sorted set keeps
        // candidates in data order, so results (and the effect of `limit`) are
        // deterministic and agree with simpleSearch.
        SortedSet<Integer> candidates = null;
        for (int i = 0; i + GRAM_SIZE <= query.length(); i++) {
            String gram = query.substring(i, i + GRAM_SIZE);
            Set<Integer> indices = indexMap.get(gram);
            if (indices == null) {
                return new ArrayList<>(); // some n-gram occurs nowhere
            }
            if (candidates == null) {
                candidates = new TreeSet<>(indices);
            } else {
                candidates.retainAll(indices);
            }
            if (candidates.isEmpty()) {
                return new ArrayList<>();
            }
        }

        // Sharing all n-grams is necessary but not sufficient; verify each candidate.
        List<String> result = new ArrayList<>();
        for (int idx : candidates) {
            String s = dataList.get(idx);
            if (s.contains(query)) {
                result.add(s);
                if (result.size() >= limit) {
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Linear scan used when the query is too short for the index.
     *
     * @param query non-null substring to look for
     * @param limit maximum number of results
     * @return a new, mutable list of at most {@code limit} matches in data order
     */
    private List<String> simpleSearch(String query, int limit) {
        List<String> result = new ArrayList<>();
        if (limit <= 0) {
            return result;
        }
        for (String s : dataList) {
            if (s != null && s.contains(query)) {
                result.add(s);
                if (result.size() >= limit) {
                    break;
                }
            }
        }
        return result;
    }
}