80 lines
2.4 KiB
Java
80 lines
2.4 KiB
Java
package com.diagnose.util;
|
||
|
||
import java.util.*;
|
||
|
||
/**
 * Substring search over a fixed list of strings, backed by an inverted n-gram index.
 *
 * <p>At construction time every n-gram (of length {@code GRAM_SIZE}) of every string is
 * mapped to the set of list positions whose string contains it. A query is answered by
 * intersecting the position sets of the query's n-grams and then confirming each surviving
 * candidate with {@link String#contains(CharSequence)}.
 *
 * <p>The matcher works on a private snapshot of the list passed to the constructor, so
 * later changes to the caller's list do not affect it. {@code null} entries in that list
 * are ignored and never returned.
 */
public class StringMatcher {

    /** Length of the n-grams used as index keys. */
    private static final int GRAM_SIZE = 1;

    /** Maps each n-gram to the positions (in {@link #dataList}) of the strings containing it. */
    private final Map<String, Set<Integer>> indexMap = new HashMap<>();

    /** Snapshot of the searchable strings; positions stored in {@link #indexMap} refer to it. */
    private final List<String> dataList;

    /**
     * Creates a matcher over a snapshot of {@code dataList} and builds its n-gram index.
     *
     * @param dataList the strings to search; copied, so the caller may modify it afterwards
     * @throws NullPointerException if {@code dataList} is {@code null}
     */
    public StringMatcher(List<String> dataList) {
        Objects.requireNonNull(dataList, "dataList");
        // Copy so the index cannot drift out of sync with the data it points into.
        this.dataList = new ArrayList<>(dataList);
        buildIndex();
    }

    /** Populates {@link #indexMap} with every n-gram of every non-null string. */
    private void buildIndex() {
        // Only n-grams are indexed (not every substring) to balance speed against memory.
        for (int i = 0; i < dataList.size(); i++) {
            String s = dataList.get(i);
            if (s == null) {
                continue; // nothing to index; a null entry can never match a query
            }
            for (int j = 0; j + GRAM_SIZE <= s.length(); j++) {
                String gram = s.substring(j, j + GRAM_SIZE);
                indexMap.computeIfAbsent(gram, k -> new HashSet<>()).add(i);
            }
        }
    }

    /**
     * Returns up to {@code limit} strings that contain {@code query} as a substring, in the
     * order in which they appear in the data list.
     *
     * @param query the substring to look for; an empty query matches every non-null string
     * @param limit the maximum number of results; values {@code <= 0} yield an empty list
     * @return the matching strings, possibly empty, never {@code null}
     * @throws NullPointerException if {@code query} is {@code null}
     */
    public List<String> search(String query, int limit) {
        Objects.requireNonNull(query, "query");
        if (limit <= 0) {
            return Collections.emptyList();
        }

        // A query shorter than one n-gram cannot use the index; fall back to a linear scan.
        if (query.length() < GRAM_SIZE) {
            return simpleSearch(query, limit);
        }

        // Intersect the position sets of all n-grams of the query to get the candidates.
        Set<Integer> candidates = null;
        for (int i = 0; i + GRAM_SIZE <= query.length(); i++) {
            String gram = query.substring(i, i + GRAM_SIZE);
            Set<Integer> positions = indexMap.get(gram);
            if (positions == null) {
                return Collections.emptyList(); // an n-gram that occurs nowhere rules out any match
            }

            if (candidates == null) {
                candidates = new HashSet<>(positions);
            } else {
                candidates.retainAll(positions);
            }

            if (candidates.isEmpty()) {
                return Collections.emptyList();
            }
        }

        // HashSet iteration order is unspecified; sort so results follow data-list order
        // (the same order simpleSearch produces) and the limit keeps the earliest matches.
        List<Integer> ordered = new ArrayList<>(candidates);
        Collections.sort(ordered);

        // Sharing all n-grams does not imply containing the query, so verify each candidate.
        List<String> result = new ArrayList<>();
        for (int idx : ordered) {
            String candidate = dataList.get(idx);
            if (candidate.contains(query)) {
                result.add(candidate);
                if (result.size() >= limit) {
                    break;
                }
            }
        }

        return result;
    }

    /**
     * Linear scan used when the index cannot help: returns up to {@code limit} strings that
     * contain {@code query}, in data-list order.
     */
    private List<String> simpleSearch(String query, int limit) {
        List<String> result = new ArrayList<>();
        if (limit <= 0) {
            return result;
        }
        for (String s : dataList) {
            if (s != null && s.contains(query)) {
                result.add(s);
                if (result.size() >= limit) {
                    break;
                }
            }
        }
        return result;
    }
}