I have the following Java code that generates our indexing keywords:
@Entity
@Indexed(index = "process")
@Table(name = "process")
public class Process extends BaseTemplateBean {
    // [... a lot of unrelated stuff ...]

    @Transient
    private transient IndexingKeyworder indexingKeyworder;

    // [... a bigger lot of unrelated stuff ...]

    @Transient
    @FullTextField(name = "search")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForFreeSearch() {
        return initializeKeywords().getSearch();
    }

    @Transient
    @FullTextField(name = "searchTitle")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingInTitle() {
        return initializeKeywords().getSearchTitle();
    }

    @Transient
    @FullTextField(name = "searchProject")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingByProjectName() {
        return initializeKeywords().getSearchProject();
    }

    @Transient
    @FullTextField(name = "searchBatch")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForAssignmentToBatches() {
        return initializeKeywords().getSearchBatch();
    }

    @Transient
    @FullTextField(name = "searchTask")
    @IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
    public String getKeywordsForSearchingForTaskInformation() {
        return initializeKeywords().getSearchTask();
    }

    // lazily creates the keyworder once, so all five getters share one computation
    private IndexingKeyworder initializeKeywords() {
        if (this.indexingKeyworder == null) {
            this.indexingKeyworder = new IndexingKeyworder(this);
        }
        return this.indexingKeyworder;
    }
}
This code uses the following helper class, so that the work is never duplicated:
class IndexingKeyworder {
    private static final String PSEUDOWORD_TASK_AUTOMATIC = "automatic";
    private static final String PSEUDOWORD_TASK_DONE = "closed";
    private static final String PSEUDOWORD_TASK_DONE_PROCESSING_USER = "closeduser";
    private static final String ANY_METADATA_MARKER = "mdWrap";
    private static final char VALUE_SEPARATOR = 'q';

    private static final Pattern TITLE_GROUPS_PATTERN = Pattern.compile("[\\p{IsLetter}\\p{Digit}]+");
    private static final Pattern METADATA_PATTERN = Pattern.compile("name=\"([^\"]+)\">([^<]*)<", Pattern.DOTALL);
    private static final Pattern METADATA_SECTIONS_PATTERN = Pattern.compile("<mets:dmdSec.*?o>(.*?)</kitodo:k", Pattern.DOTALL);
    private static final Pattern RULESET_KEY_PATTERN = Pattern.compile("key id=\"([^\"]+)\">(.*?)</key>", Pattern.DOTALL);
    private static final Pattern RULESET_LABEL_PATTERN = Pattern.compile("<label[^>]*>([^<]+)", Pattern.DOTALL);

    // static, so the parsed ruleset is shared across all processes and each file is only read once
    private static final Map<String, Map<String, Collection<String>>> rulesetCache = new HashMap<>();

    private Set<String> titleKeywords = Collections.emptySet();
    private Set<String> projectKeywords = Collections.emptySet();
    private Set<String> batchKeywords = Collections.emptySet();
    private Set<String> taskKeywords = Collections.emptySet();
    private Set<String> taskPseudoKeywords = Collections.emptySet();
    private Set<String> metadataKeywords = Collections.emptySet();
    private Set<String> metadataPseudoKeywords = Collections.emptySet();
    private String processId = null;
    private Set<String> commentKeywords = Collections.emptySet();
    public IndexingKeyworder(Process process) {
        this.titleKeywords = filterMinLength(initTitleKeywords(process.getTitle()));
        this.projectKeywords = filterMinLength(initSimpleKeywords(Objects.nonNull(process.getProject())
                ? process.getProject().getTitle() : ""));
        this.batchKeywords = filterMinLength(initBatchKeywords(process.getBatches()));
        var taskKeywords = initTaskKeywords(process.getTasksUnmodified());
        this.taskKeywords = filterMinLength(taskKeywords.getLeft());
        this.taskPseudoKeywords = filterMinLength(taskKeywords.getRight());
        var metadataKeywords = initMetadataKeywords(process);
        this.metadataKeywords = filterMinLength(metadataKeywords.getLeft());
        this.metadataPseudoKeywords = filterMinLength(metadataKeywords.getRight());
        this.processId = process.getId().toString();
        this.commentKeywords = filterMinLength(initCommentKeywords(process.getComments()));
    }
    private static Set<String> initTitleKeywords(String processTitle) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = TITLE_GROUPS_PATTERN.matcher(processTitle);
        while (matcher.find()) {
            String normalized = normalize(matcher.group());
            final int length = normalized.length();
            // every prefix, so the fragment can be searched from the front ...
            for (int end = 1; end <= length; end++) {
                tokens.add(normalized.substring(0, end));
            }
            // ... and every suffix, so it can be searched from the back
            for (int beginning = length - 1; beginning >= 0; beginning--) {
                tokens.add(normalized.substring(beginning, length));
            }
        }
        return tokens;
    }
    private static Set<String> initSimpleKeywords(String input) {
        Set<String> tokens = new HashSet<>();
        for (String term : splitValues(input)) {
            tokens.add(normalize(term));
        }
        return tokens;
    }

    private static Set<String> initBatchKeywords(Collection<Batch> batches) {
        if (batches.isEmpty()) {
            return Collections.emptySet();
        }
        Set<String> tokens = new HashSet<>();
        for (Batch batch : batches) {
            String optionalTitle = batch.getTitle();
            if (StringUtils.isNotBlank(optionalTitle)) {
                tokens.addAll(initSimpleKeywords(optionalTitle));
            }
        }
        return tokens;
    }
    private static Pair<Set<String>, Set<String>> initTaskKeywords(Collection<Task> tasks) {
        Set<String> taskKeywords = new HashSet<>();
        Set<String> taskPseudoKeywords = new HashSet<>();
        for (Task task : tasks) {
            for (String token : splitValues(task.getTitle())) {
                String term = normalize(token);
                taskKeywords.add(term);
                if (task.isTypeAutomatic()) {
                    taskKeywords.add(PSEUDOWORD_TASK_AUTOMATIC + VALUE_SEPARATOR + term);
                }
                TaskStatus taskStatus = task.getProcessingStatus();
                if (Objects.isNull(taskStatus)) {
                    continue;
                }
                if (Objects.equals(taskStatus, TaskStatus.DONE)) {
                    taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE);
                    taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE + VALUE_SEPARATOR + term);
                    User closedUser = task.getProcessingUser();
                    if (Objects.isNull(closedUser)) {
                        continue;
                    }
                    if (StringUtils.isNotBlank(closedUser.getName())) {
                        taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE_PROCESSING_USER + VALUE_SEPARATOR
                                + normalize(closedUser.getName()));
                    }
                    if (StringUtils.isNotBlank(closedUser.getSurname())) {
                        taskPseudoKeywords.add(PSEUDOWORD_TASK_DONE_PROCESSING_USER + VALUE_SEPARATOR
                                + normalize(closedUser.getSurname()));
                    }
                } else {
                    String taskKeyword = taskStatus.toString().toLowerCase();
                    taskPseudoKeywords.add(taskKeyword);
                    taskPseudoKeywords.add(taskKeyword + VALUE_SEPARATOR + term);
                }
            }
        }
        return Pair.of(taskKeywords, taskPseudoKeywords);
    }
    private static Pair<Set<String>, Set<String>> initMetadataKeywords(Process process) {
        final Pair<Set<String>, Set<String>> emptyResult = Pair.of(Collections.emptySet(), Collections.emptySet());
        try {
            String processId = Integer.toString(process.getId());
            Path path = Paths.get(KitodoConfig.getKitodoDataDirectory(), processId, "meta.xml");
            if (!Files.isReadable(path)) {
                return emptyResult;
            }
            // the meta.xml file is read exactly once per process
            String metaXml = FileUtils.readFileToString(path.toFile(), StandardCharsets.UTF_8);
            if (!metaXml.contains(ANY_METADATA_MARKER)) {
                return emptyResult;
            }
            Set<String> metadataKeywords = new HashSet<>();
            Set<String> metadataPseudoKeywords = new HashSet<>();
            Map<String, Collection<String>> rulesetLabelMap = getRulesetLabelMap(process.getRuleset().getFile());
            Matcher metadataSectionsMatcher = METADATA_SECTIONS_PATTERN.matcher(metaXml);
            while (metadataSectionsMatcher.find()) {
                Matcher keyMatcher = METADATA_PATTERN.matcher(metadataSectionsMatcher.group(1));
                while (keyMatcher.find()) {
                    String key = normalize(keyMatcher.group(1));
                    String valueString = keyMatcher.group(2);
                    for (String singleValue : splitValues(valueString)) {
                        String value = normalize(singleValue);
                        metadataKeywords.add(value);
                        metadataPseudoKeywords.add(key + VALUE_SEPARATOR + value);
                        metadataPseudoKeywords.add(key);
                        // additional search terms derived from the ruleset labels for this key
                        for (String label : rulesetLabelMap.getOrDefault(key, Collections.emptyList())) {
                            metadataPseudoKeywords.add(label + VALUE_SEPARATOR + value);
                            metadataPseudoKeywords.add(label);
                        }
                    }
                }
            }
            return Pair.of(metadataKeywords, metadataPseudoKeywords);
        } catch (IOException | RuntimeException e) {
            return emptyResult;
        }
    }
    private static Map<String, Collection<String>> getRulesetLabelMap(String file) {
        Map<String, Collection<String>> rulesetLabelMap = rulesetCache.get(file);
        if (Objects.nonNull(rulesetLabelMap)) {
            return rulesetLabelMap;
        }
        try {
            File rulesetFile = Paths.get(KitodoConfig.getParameter("directory.rulesets"), file).toFile();
            String ruleset = FileUtils.readFileToString(rulesetFile, StandardCharsets.UTF_8);
            rulesetLabelMap = new HashMap<>();
            Matcher keysMatcher = RULESET_KEY_PATTERN.matcher(ruleset);
            while (keysMatcher.find()) {
                String key = normalize(keysMatcher.group(1));
                Matcher labelMatcher = RULESET_LABEL_PATTERN.matcher(keysMatcher.group(2));
                Set<String> labels = new HashSet<>();
                while (labelMatcher.find()) {
                    labels.add(normalize(labelMatcher.group(1)));
                }
                rulesetLabelMap.put(key, labels);
            }
            rulesetCache.put(file, rulesetLabelMap);
            return rulesetLabelMap;
        } catch (IOException | RuntimeException e) {
            return Collections.emptyMap();
        }
    }
    private static Set<String> initCommentKeywords(List<Comment> comments) {
        Set<String> tokens = new HashSet<>();
        for (Comment comment : comments) {
            String message = comment.getMessage();
            if (StringUtils.isNotBlank(message)) {
                tokens.addAll(initSimpleKeywords(message));
            }
        }
        return tokens;
    }
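    // after lower-casing, strips everything except digits, lower-case ASCII letters,
    // and characters above U+00BF (such as accented letters)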
    private static String normalize(String string) {
        return string.toLowerCase().replaceAll("[\0-/:-`{-¿]", "");
    }

    private static List<String> splitValues(String value) {
        String initializedValue = value != null ? value : "";
        return Arrays.asList(initializedValue.split("[ ,\\-._]+"));
    }

    private static Set<String> filterMinLength(Set<String> tokens) {
        tokens.removeIf(token -> token.length() < 3);
        return tokens;
    }
    public String getSearch() {
        Set<String> freeKeywords = new HashSet<>();
        freeKeywords.addAll(titleKeywords);
        freeKeywords.addAll(projectKeywords);
        freeKeywords.addAll(batchKeywords);
        freeKeywords.addAll(taskKeywords);
        freeKeywords.addAll(metadataKeywords);
        freeKeywords.addAll(metadataPseudoKeywords);
        if (Objects.nonNull(processId)) {
            freeKeywords.add(processId);
        }
        freeKeywords.addAll(commentKeywords);
        return String.join(" ", freeKeywords);
    }

    public String getSearchTitle() {
        return String.join(" ", titleKeywords);
    }

    public String getSearchProject() {
        return String.join(" ", projectKeywords);
    }

    public String getSearchBatch() {
        return String.join(" ", batchKeywords);
    }

    public String getSearchTask() {
        Set<String> allTaskKeywords = new HashSet<>();
        allTaskKeywords.addAll(taskKeywords);
        allTaskKeywords.addAll(taskPseudoKeywords);
        return String.join(" ", allTaskKeywords);
    }
}
I have now been accused of not using the Hibernate Search framework: the functionality is supposed to be provided via its annotations. I am very open to not reinventing existing functionality, so how can I do that?
The following requirements must be preserved for performance reasons:
- the string metaXml must only be read once!
- the rulesetLabelMap must only be created once per ruleset file and then be served from the cache! (500k objects can be processed with the same file!)
- all tokens must be normalized!
- the special title rule must be followed: the title is cut at characters that are neither letters nor digits, and each fragment must be searchable from the front or from the back (but not both combined), with a minimum length of 3 characters; see the small example after this list
- the metadata search terms must be generated using the ruleset (see the Java code above)
- the various search terms must be included in the joint search ("search"), but not all of them (see the Java code above), and they must also be searchable separately
- and take into account that none of the calculations may be performed twice!
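To make the title rule concrete, a hypothetical example: the title "Kitodo-123" is cut into the fragments "Kitodo" and "123", normalized to "kitodo" and "123", and indexing then keeps

kit, kito, kitod, kitodo   (prefixes of "kitodo" with at least 3 characters)
odo, todo, itodo, kitodo   (suffixes of "kitodo" with at least 3 characters)
123                        (its shorter prefixes and suffixes are dropped)

so the process is found via "kito" (front) or "todo" (back), but not via the middle substring "itod".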
Please let me know how I can implement this with the annotations offered by Hibernate Search, guaranteed without any drawbacks!
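For what it's worth, here is roughly what I imagine a solution could look like: a minimal sketch assuming a recent Hibernate Search 6 version (where TypeBridge is generic), using the TypeBinder/TypeBridge API so that all five fields are produced in one pass and the keyworder is built only once per document. The class ProcessKeywordBinder and everything inside it is my own invention, not a confirmed solution:

import org.hibernate.search.engine.backend.document.DocumentElement;
import org.hibernate.search.engine.backend.document.IndexFieldReference;
import org.hibernate.search.engine.backend.document.model.dsl.IndexSchemaElement;
import org.hibernate.search.mapper.pojo.bridge.TypeBridge;
import org.hibernate.search.mapper.pojo.bridge.binding.TypeBindingContext;
import org.hibernate.search.mapper.pojo.bridge.mapping.programmatic.TypeBinder;
import org.hibernate.search.mapper.pojo.bridge.runtime.TypeBridgeWriteContext;

// hypothetical sketch: one binder declares all keyword fields of the Process index
public class ProcessKeywordBinder implements TypeBinder {

    @Override
    public void bind(TypeBindingContext context) {
        // no automatic reindexing on updates, like ReindexOnUpdate.NO in the getter variant
        context.dependencies().useRootOnly();

        IndexSchemaElement schema = context.indexSchemaElement();
        // "default" is the analyzer name @FullTextField falls back to when none is given
        IndexFieldReference<String> search = schema
                .field("search", f -> f.asString().analyzer("default")).toReference();
        IndexFieldReference<String> searchTitle = schema
                .field("searchTitle", f -> f.asString().analyzer("default")).toReference();
        // ... declare searchProject, searchBatch and searchTask the same way ...

        context.bridge(Process.class, new ProcessKeywordBridge(search, searchTitle));
    }

    private static class ProcessKeywordBridge implements TypeBridge<Process> {
        private final IndexFieldReference<String> search;
        private final IndexFieldReference<String> searchTitle;

        private ProcessKeywordBridge(IndexFieldReference<String> search,
                IndexFieldReference<String> searchTitle) {
            this.search = search;
            this.searchTitle = searchTitle;
        }

        @Override
        public void write(DocumentElement target, Process process, TypeBridgeWriteContext writeContext) {
            // the keyworder is created once per indexed document, so nothing is calculated twice
            IndexingKeyworder keyworder = new IndexingKeyworder(process);
            target.addValue(search, keyworder.getSearch());
            target.addValue(searchTitle, keyworder.getSearchTitle());
            // ... add the remaining fields analogously ...
        }
    }
}

The entity would then presumably carry only @Indexed and @TypeBinding(binder = @TypeBinderRef(type = ProcessKeywordBinder.class)) instead of the five annotated getters. Is that the intended way, or is there a more annotation-only mechanism I am missing?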