2025-03-17 10:46:29 +08:00

80 lines
2.4 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.diagnose.util;
import java.util.*;
/**
 * Substring search over a fixed list of strings, accelerated by an inverted
 * n-gram index.
 *
 * <p>At construction time every n-gram (of length {@link #GRAM_SIZE}) of every
 * entry is mapped to the set of list positions that contain it. A query is
 * answered by intersecting the position sets of the query's n-grams and then
 * verifying each surviving candidate with {@link String#contains(CharSequence)}.
 *
 * <p>The matcher works on a private snapshot of the list passed to the
 * constructor, so later changes to that list do not affect (or corrupt) it.
 * Results are always returned in the order of the original list.
 */
public class StringMatcher {
    /** Length of the n-grams used as index keys. */
    private static final int GRAM_SIZE = 1;

    /** Inverted index: n-gram -> positions in {@link #dataList} whose string contains it. */
    private final Map<String, Set<Integer>> indexMap = new HashMap<>();

    /** Snapshot of the searchable strings; positions in the index refer to this list. */
    private final List<String> dataList;

    /**
     * Builds a matcher over a snapshot of {@code dataList}.
     *
     * @param dataList strings to search; {@code null} elements are tolerated and never match
     * @throws NullPointerException if {@code dataList} is {@code null}
     */
    public StringMatcher(List<String> dataList) {
        Objects.requireNonNull(dataList, "dataList");
        // Copy so that the index positions stay valid even if the caller mutates its list.
        this.dataList = new ArrayList<>(dataList);
        buildIndex();
    }

    /** Populates {@link #indexMap} with every n-gram of every non-null entry. */
    private void buildIndex() {
        // Only fixed-size n-grams are indexed (not all substrings) to balance speed and memory.
        for (int i = 0; i < dataList.size(); i++) {
            String s = dataList.get(i);
            if (s == null) {
                continue; // nothing to index; a null entry can never match a query
            }
            for (int j = 0; j + GRAM_SIZE <= s.length(); j++) {
                String gram = s.substring(j, j + GRAM_SIZE);
                indexMap.computeIfAbsent(gram, k -> new HashSet<>()).add(i);
            }
        }
    }

    /**
     * Returns up to {@code limit} entries that contain {@code query} as a substring,
     * in the order they appear in the original list.
     *
     * @param query substring to look for; the empty string matches every non-null entry
     * @param limit maximum number of results; values {@code <= 0} yield an empty list
     * @return a new mutable list with at most {@code limit} matching entries
     * @throws NullPointerException if {@code query} is {@code null}
     */
    public List<String> search(String query, int limit) {
        Objects.requireNonNull(query, "query");
        if (limit <= 0) {
            return new ArrayList<>();
        }
        // A query shorter than one n-gram cannot use the index; fall back to a linear scan.
        if (query.length() < GRAM_SIZE) {
            return simpleSearch(query, limit);
        }

        // Distinct n-grams of the query; repeated grams would only repeat the same intersection.
        Set<String> grams = new HashSet<>();
        for (int i = 0; i + GRAM_SIZE <= query.length(); i++) {
            grams.add(query.substring(i, i + GRAM_SIZE));
        }

        // Look up the posting set of every gram; a gram absent from the index means no match.
        List<Set<Integer>> postings = new ArrayList<>(grams.size());
        Set<Integer> smallest = null;
        for (String gram : grams) {
            Set<Integer> positions = indexMap.get(gram);
            if (positions == null) {
                return new ArrayList<>();
            }
            postings.add(positions);
            if (smallest == null || positions.size() < smallest.size()) {
                smallest = positions;
            }
        }

        // Start from the smallest posting set and intersect with the rest. A TreeSet keeps
        // candidates in ascending position order so results follow the original list order
        // and the limit cut-off is deterministic.
        Set<Integer> candidates = new TreeSet<>(smallest);
        for (Set<Integer> positions : postings) {
            if (positions != smallest) {
                candidates.retainAll(positions);
                if (candidates.isEmpty()) {
                    return new ArrayList<>();
                }
            }
        }

        // Sharing all n-grams is necessary but not sufficient: verify the real substring match.
        List<String> result = new ArrayList<>();
        for (Integer idx : candidates) {
            String s = dataList.get(idx);
            if (s.contains(query)) {
                result.add(s);
                if (result.size() >= limit) {
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Linear-scan fallback used when the query is too short for the index.
     *
     * @param query substring to look for
     * @param limit maximum number of results; values {@code <= 0} yield an empty list
     * @return a new mutable list with at most {@code limit} matching entries, in list order
     */
    private List<String> simpleSearch(String query, int limit) {
        List<String> result = new ArrayList<>();
        if (limit <= 0) {
            return result;
        }
        for (String s : dataList) {
            if (s != null && s.contains(query)) {
                result.add(s);
                if (result.size() >= limit) {
                    break;
                }
            }
        }
        return result;
    }
}