From f2ff0e3dbcc0023883c35ba9e6186619c3eeca31 Mon Sep 17 00:00:00 2001 From: Alex Bogdanovski Date: Wed, 17 Jan 2018 01:56:18 +0200 Subject: [PATCH] added a new nested indexing mode to prevent a possible mapping explosion --- README.md | 33 ++ pom.xml | 2 +- .../erudika/para/search/ElasticSearch.java | 113 +++-- .../para/search/ElasticSearchUtils.java | 420 +++++++++++++++++- .../erudika/para/search/ElasticSearchIT.java | 115 +++++ 5 files changed, 633 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index f56b462..3c97497 100755 --- a/README.md +++ b/README.md @@ -74,6 +74,39 @@ para.search = "ElasticSearch" This could be a Java system property or part of a `application.conf` file on the classpath. This tells Para to use the Elasticsearch implementation instead of the default (Lucene). +### Indexing modes + +This plugin has two indexing modes: **normal** and **nested**. The nested mode was added after v1.28 to protect against +a possible [mapping explosion](https://discuss.elastic.co/t/can-nested-fields-prevent-mapping-explosion/95464) which +happens when there are lots of objects with lots of different custom properties in them. This overloads the Elasticsearch +index metadata and can crash the whole cluster. This indexing mode affects only custom properties in `Sysprop` objects. + +The old "normal" mode is suitable for most Para deployments, with just a few tenants or a single tenant +(one app per server). In this mode, Para objects are indexed without modification (all data types are preserved) +but this could lead to a mapping explosion. + +The nested data structure for these two indexing modes is shown below: +``` +// NORMAL MODE // NESTED MODE +{ { + "id": "123", "id": "123", + "appid": "para", "appid": "para", + "type": "custom", "type": "custom", + "properties": { "properties": [ + "key1": "value1", {"k": "key1", "v": "value1"}, + "key2": { {"k": "key2-subkey1", "v": "subValue1"}, + "subkey1": "subValue1" {"k": "numericKey3", "vn": 5} + }, ], + "numericKey3": 5 "_properties": "{\"key1\":\"value1\"}..." + } } +} +``` + +Switching to the new nested indexing mode is done with the configuration property: +``` +para.es.es.use_nested_custom_fields = true +``` + ### Calling Elasticsearch through the proxy endpoint You can directly call the Elasticsearch API through `/v1/_elasticsearch`. To enable it set `para.es.proxy_enabled = true` first. diff --git a/pom.xml b/pom.xml index 4528c9b..bb4177d 100644 --- a/pom.xml +++ b/pom.xml @@ -112,7 +112,7 @@ com.github.alexcojocaru elasticsearch-maven-plugin - 5.11 + 6.2 ${skipITs} false diff --git a/src/main/java/com/erudika/para/search/ElasticSearch.java b/src/main/java/com/erudika/para/search/ElasticSearch.java index c433ff7..15cea3d 100644 --- a/src/main/java/com/erudika/para/search/ElasticSearch.java +++ b/src/main/java/com/erudika/para/search/ElasticSearch.java @@ -25,16 +25,25 @@ import com.erudika.para.core.Tag; import com.erudika.para.core.utils.CoreUtils; import com.erudika.para.persistence.DAO; +import static com.erudika.para.search.ElasticSearchUtils.PROPS_PREFIX; +import static com.erudika.para.search.ElasticSearchUtils.PROPS_REGEX; +import static com.erudika.para.search.ElasticSearchUtils.convertQueryStringToNestedQuery; import static com.erudika.para.search.ElasticSearchUtils.getIndexName; +import static com.erudika.para.search.ElasticSearchUtils.getNestedKey; import static com.erudika.para.search.ElasticSearchUtils.getPager; import static com.erudika.para.search.ElasticSearchUtils.getTermsQuery; import static com.erudika.para.search.ElasticSearchUtils.getType; +import static com.erudika.para.search.ElasticSearchUtils.getValueFieldName; import static com.erudika.para.search.ElasticSearchUtils.isAsyncEnabled; +import static com.erudika.para.search.ElasticSearchUtils.keyValueBoolQuery; +import static com.erudika.para.search.ElasticSearchUtils.nestedMode; +import static com.erudika.para.search.ElasticSearchUtils.nestedPropsQuery; import static com.erudika.para.search.ElasticSearchUtils.qs; import com.erudika.para.utils.Config; import com.erudika.para.utils.Pager; import com.erudika.para.utils.Utils; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; @@ -65,7 +74,18 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item; import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; +import static org.elasticsearch.index.query.QueryBuilders.boolQuery; +import static org.elasticsearch.index.query.QueryBuilders.geoDistanceQuery; +import static org.elasticsearch.index.query.QueryBuilders.idsQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchQuery; +import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery; +import static org.elasticsearch.index.query.QueryBuilders.nestedQuery; +import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; +import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery; +import static org.elasticsearch.index.query.QueryBuilders.termQuery; +import static org.elasticsearch.index.query.QueryBuilders.termsQuery; +import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.sort.SortBuilder; @@ -230,7 +250,7 @@ public void unindexAll(String appid, Map terms, boolean matchAll) { } QueryBuilder fb = (terms == null || terms.isEmpty()) ? - QueryBuilders.matchAllQuery() : getTermsQuery(terms, matchAll); + matchAllQuery() : getTermsQuery(terms, matchAll); SearchResponse scrollResp = client().prepareSearch(getIndexName(appid)) .setScroll(new TimeValue(60000)) .setQuery(fb) @@ -279,7 +299,7 @@ public

List

findByIds(String appid, List ids) return list; } try { - QueryBuilder qb = QueryBuilders.termsQuery(Config._ID, ids); + QueryBuilder qb = termsQuery(Config._ID, ids); return searchQuery(appid, null, qb); } catch (Exception e) { logger.warn(null, e); @@ -293,7 +313,18 @@ public

List

findTermInList(String appid, String type, if (StringUtils.isBlank(field) || terms == null) { return Collections.emptyList(); } - QueryBuilder qb = QueryBuilders.termsQuery(field, terms); + QueryBuilder qb; + if (nestedMode() && field.startsWith(PROPS_PREFIX)) { + QueryBuilder bfb = null; + BoolQueryBuilder fb = boolQuery(); + for (Object term : terms) { + bfb = keyValueBoolQuery(field, String.valueOf(term)); + fb.should(bfb); + } + qb = nestedPropsQuery(terms.size() > 1 ? fb : bfb); + } else { + qb = termsQuery(field, terms); + } return searchQuery(appid, type, qb, pager); } @@ -303,7 +334,13 @@ public

List

findPrefix(String appid, String type, if (StringUtils.isBlank(field) || StringUtils.isBlank(prefix)) { return Collections.emptyList(); } - return searchQuery(appid, type, QueryBuilders.prefixQuery(field, prefix), pager); + QueryBuilder qb; + if (nestedMode() && field.startsWith(PROPS_PREFIX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, prefixQuery(getValueFieldName(prefix), prefix))); + } else { + qb = prefixQuery(field, prefix); + } + return searchQuery(appid, type, qb, pager); } @Override @@ -312,7 +349,14 @@ public

List

findQuery(String appid, String type, if (StringUtils.isBlank(query)) { return Collections.emptyList(); } - QueryBuilder qb = QueryBuilders.queryStringQuery(qs(query)).allowLeadingWildcard(false); + // a basic implementation of support for nested queries in query string + // https://github.com/elastic/elasticsearch/issues/11322 + QueryBuilder qb; + if (nestedMode() && query.matches(PROPS_REGEX)) { + qb = convertQueryStringToNestedQuery(query); + } else { + qb = queryStringQuery(qs(query)).allowLeadingWildcard(false); + } return searchQuery(appid, type, qb, pager); } @@ -323,7 +367,7 @@ public

List

findNestedQuery(String appid, String type, return Collections.emptyList(); } String queryString = "nstd." + field + ":" + query; - QueryBuilder qb = QueryBuilders.nestedQuery("nstd", QueryBuilders.queryStringQuery(qs(queryString)), Avg); + QueryBuilder qb = nestedQuery("nstd", queryStringQuery(qs(queryString)), Avg); return searchQuery(appid, type, qb, pager); } @@ -333,7 +377,12 @@ public

List

findWildcard(String appid, String type, if (StringUtils.isBlank(field) || StringUtils.isBlank(wildcard)) { return Collections.emptyList(); } - QueryBuilder qb = QueryBuilders.wildcardQuery(field, wildcard); + QueryBuilder qb; + if (nestedMode() && field.startsWith(PROPS_PREFIX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, wildcardQuery(getValueFieldName(wildcard), wildcard))); + } else { + qb = wildcardQuery(field, wildcard); + } return searchQuery(appid, type, qb, pager); } @@ -344,10 +393,10 @@ public

List

findTagged(String appid, String type, return Collections.emptyList(); } - BoolQueryBuilder tagFilter = QueryBuilders.boolQuery(); + BoolQueryBuilder tagFilter = boolQuery(); //assuming clean & safe tags here for (String tag : tags) { - tagFilter.must(QueryBuilders.termQuery(Config._TAGS, tag)); + tagFilter.must(termQuery(Config._TAGS, tag)); } // The filter looks like this: ("tag1" OR "tag2" OR "tag3") AND "type" return searchQuery(appid, type, tagFilter, pager); @@ -379,14 +428,26 @@ public

List

findSimilar(String appid, String type, Str QueryBuilder qb; if (fields == null || fields.length == 0) { - qb = QueryBuilders.moreLikeThisQuery(new String[]{liketext}).minDocFreq(1).minTermFreq(1); + qb = moreLikeThisQuery(new String[]{liketext}).minDocFreq(1).minTermFreq(1); } else { - qb = QueryBuilders.moreLikeThisQuery(fields, new String[]{liketext}, Item.EMPTY_ARRAY). - minDocFreq(1).minTermFreq(1); + boolean containsNestedProps = Arrays.stream(fields).anyMatch((f) -> StringUtils.startsWith(f, PROPS_PREFIX)); + if (nestedMode() && containsNestedProps) { + BoolQueryBuilder bqb = boolQuery(); + for (String field : fields) { + QueryBuilder kQuery = matchQuery(PROPS_PREFIX + "k", getNestedKey(field)); + QueryBuilder vQuery = moreLikeThisQuery(new String[]{PROPS_PREFIX + "v"}, + new String[]{liketext}, Item.EMPTY_ARRAY).minDocFreq(1).minTermFreq(1); + bqb.should(nestedPropsQuery(boolQuery().must(kQuery).must(vQuery))); + } + qb = bqb; + } else { + qb = moreLikeThisQuery(fields, new String[]{liketext}, Item.EMPTY_ARRAY). + minDocFreq(1).minTermFreq(1); + } } if (!StringUtils.isBlank(filterKey)) { - qb = QueryBuilders.boolQuery().mustNot(QueryBuilders.termQuery(Config._ID, filterKey)).filter(qb); + qb = boolQuery().mustNot(termQuery(Config._ID, filterKey)).filter(qb); } return searchQuery(appid, searchQueryRaw(appid, type, qb, pager)); } @@ -396,7 +457,7 @@ public

List

findTags(String appid, String keyword, Pag if (StringUtils.isBlank(keyword)) { return Collections.emptyList(); } - QueryBuilder qb = QueryBuilders.wildcardQuery("tag", keyword.concat("*")); + QueryBuilder qb = wildcardQuery("tag", keyword.concat("*")); return searchQuery(appid, Utils.type(Tag.class), qb, pager); } @@ -410,11 +471,9 @@ public

List

findNearby(String appid, String type, if (StringUtils.isBlank(query)) { query = "*"; } - Pager page = getPager(pager); // find nearby Address objects - QueryBuilder qb1 = QueryBuilders.geoDistanceQuery("latlng").point(lat, lng). - distance(radius, DistanceUnit.KILOMETERS); - + Pager page = getPager(pager); + QueryBuilder qb1 = geoDistanceQuery("latlng").point(lat, lng).distance(radius, DistanceUnit.KILOMETERS); SearchHits hits1 = searchQueryRaw(appid, Utils.type(Address.class), qb1, page); page.setLastKey(null); // will cause problems if not cleared @@ -435,12 +494,8 @@ public

List

findNearby(String appid, String type, } } - QueryBuilder qb2 = QueryBuilders.boolQuery(). - must(QueryBuilders.queryStringQuery(qs(query))). - filter(QueryBuilders.idsQuery().addIds(parentids)); - + QueryBuilder qb2 = boolQuery().must(queryStringQuery(qs(query))).filter(idsQuery().addIds(parentids)); SearchHits hits2 = searchQueryRaw(appid, type, qb2, page); - return searchQuery(appid, hits2); } @@ -524,10 +579,10 @@ protected SearchHits searchQueryRaw(String appid, String type, QueryBuilder quer int start = (pageNum < 1 || pageNum > Config.MAX_PAGES) ? 0 : (pageNum - 1) * max; if (query == null) { - query = QueryBuilders.matchAllQuery(); + query = matchAllQuery(); } if (!StringUtils.isBlank(type)) { - query = QueryBuilders.boolQuery().must(query).must(QueryBuilders.termQuery(Config._TYPE, type)); + query = boolQuery().must(query).must(termQuery(Config._TYPE, type)); } SearchHits hits = null; @@ -604,9 +659,9 @@ public Long getCount(String appid, String type) { } QueryBuilder query; if (!StringUtils.isBlank(type)) { - query = QueryBuilders.termQuery(Config._TYPE, type); + query = termQuery(Config._TYPE, type); } else { - query = QueryBuilders.matchAllQuery(); + query = matchAllQuery(); } Long count = 0L; try { @@ -629,7 +684,7 @@ public Long getCount(String appid, String type, Map terms) { QueryBuilder query = getTermsQuery(terms, true); if (query != null) { if (!StringUtils.isBlank(type)) { - query = QueryBuilders.boolQuery().must(query).must(QueryBuilders.termQuery(Config._TYPE, type)); + query = boolQuery().must(query).must(termQuery(Config._TYPE, type)); } try { SearchRequestBuilder crb = client().prepareSearch(getIndexName(appid)).setSize(0).setQuery(query); diff --git a/src/main/java/com/erudika/para/search/ElasticSearchUtils.java b/src/main/java/com/erudika/para/search/ElasticSearchUtils.java index a6cca69..917d38e 100644 --- a/src/main/java/com/erudika/para/search/ElasticSearchUtils.java +++ b/src/main/java/com/erudika/para/search/ElasticSearchUtils.java @@ -19,6 +19,7 @@ import com.erudika.para.core.App; import com.erudika.para.core.ParaObject; +import com.erudika.para.core.Sysprop; import com.erudika.para.core.utils.ParaObjectUtils; import com.erudika.para.persistence.DAO; import com.erudika.para.utils.Config; @@ -29,13 +30,26 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; +import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.WildcardQuery; +import static org.apache.lucene.search.join.ScoreMode.Avg; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.DocWriteResponse; import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; @@ -53,8 +67,18 @@ import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.NestedQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; +import static org.elasticsearch.index.query.QueryBuilders.boolQuery; +import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchQuery; +import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery; +import static org.elasticsearch.index.query.QueryBuilders.nestedQuery; +import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; +import static org.elasticsearch.index.query.QueryBuilders.rangeQuery; +import static org.elasticsearch.index.query.QueryBuilders.termQuery; +import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery; import org.elasticsearch.index.query.RangeQueryBuilder; import org.elasticsearch.search.sort.SortBuilder; import org.elasticsearch.search.sort.SortBuilders; @@ -73,14 +97,34 @@ public final class ElasticSearchUtils { private static TransportClient searchClient; private static final String DATE_FORMAT = "epoch_millis||epoch_second||yyyy-MM-dd HH:mm:ss||" + "yyyy-MM-dd||yyyy/MM/dd||yyyyMMdd||yyyy"; + + static final String PROPS_FIELD = "properties"; + static final String PROPS_PREFIX = PROPS_FIELD + "."; + static final String PROPS_JSON = "_" + PROPS_FIELD; + static final String PROPS_REGEX = "(^|.*\\W)" + PROPS_FIELD + "[\\.\\:].+"; + + /** + * Switches between normal indexing and indexing with nested key/value objects for Sysprop.properties. + * When this is 'false' (normal mode), Para objects will be indexed without modification but this could lead to + * a field mapping explosion and crash the ES cluster. + * + * When set to 'true' (nested mode), Para objects will be indexed with all custom fields flattened to an array of + * key/value properties: properties: [{"k": "field", "v": "value"},...]. This is done for Sysprop objects with + * containing custom properties. This mode prevents an eventual field mapping explosion. + */ + static boolean nestedMode() { + return Config.getConfigBoolean("es.use_nested_custom_fields", false); + } + /** * A list of default mappings that are defined upon index creation. */ - private static final String DEFAULT_MAPPING = - "{\n" + + private static String getDefaultMapping() { + return "{\n" + " \"_default_\": {\n" + " \"properties\": {\n" + " \"nstd\": {\"type\": \"nested\"},\n" + + " \"properties\": {\"type\": \"" + (nestedMode() ? "nested" : "object") + "\"},\n" + " \"latlng\": {\"type\": \"geo_point\"},\n" + " \"_docid\": {\"type\": \"long\", \"index\": false},\n" + " \"updated\": {\"type\": \"date\", \"format\" : \"" + DATE_FORMAT + "\"},\n" + @@ -103,6 +147,19 @@ public final class ElasticSearchUtils { " }\n" + " }\n" + "}"; + } + + /** + * These fields are not indexed. + */ + private static final String[] IGNORED_FIELDS = new String[] { + "settings", // App + "datatypes", // App + "deviceState", // Thing + "deviceMetadata", // Thing + "resourcePermissions", // App + "validationConstraints" // App + }; private ElasticSearchUtils() { } @@ -189,7 +246,7 @@ private static boolean createIndexWithoutAlias(String name, int shards, int repl setSettings(settings.build()); // default system mapping (all the rest are dynamic) - create.addMapping("_default_", DEFAULT_MAPPING, XContentType.JSON); + create.addMapping("_default_", getDefaultMapping(), XContentType.JSON); create.execute().actionGet(); logger.info("Created a new index '{}' with {} shards, {} replicas.", name, shards, replicas); } catch (Exception e) { @@ -432,7 +489,7 @@ public static boolean addIndexAlias(String indexName, String aliasName, boolean if (withAliasRouting) { aliasBuilder = AliasActions.add().index(index).alias(alias). searchRouting(alias).indexRouting(alias). - filter(QueryBuilders.termQuery(Config._APPID, aliasName)); // DO NOT trim filter query! + filter(termQuery(Config._APPID, aliasName)); // DO NOT trim filter query! } else { aliasBuilder = AliasActions.add().index(index).alias(alias); } @@ -598,7 +655,7 @@ static boolean isAsyncEnabled() { * @return the filter */ static QueryBuilder getTermsQuery(Map terms, boolean mustMatchAll) { - BoolQueryBuilder fb = QueryBuilders.boolQuery(); + BoolQueryBuilder fb = boolQuery(); int addedTerms = 0; boolean noop = true; QueryBuilder bfb = null; @@ -611,18 +668,13 @@ static QueryBuilder getTermsQuery(Map terms, boolean mustMatchAll) { continue; } Matcher matcher = Pattern.compile(".*(<|>|<=|>=)$").matcher(term.getKey().trim()); - bfb = QueryBuilders.termQuery(term.getKey(), stringValue); if (matcher.matches()) { - String key = term.getKey().replaceAll("[<>=\\s]+$", ""); - RangeQueryBuilder rfb = QueryBuilders.rangeQuery(key); - if (">".equals(matcher.group(1))) { - bfb = rfb.gt(stringValue); - } else if ("<".equals(matcher.group(1))) { - bfb = rfb.lt(stringValue); - } else if (">=".equals(matcher.group(1))) { - bfb = rfb.gte(stringValue); - } else if ("<=".equals(matcher.group(1))) { - bfb = rfb.lte(stringValue); + bfb = range(matcher.group(1), term.getKey(), stringValue); + } else { + if (nestedMode() && term.getKey().startsWith(PROPS_PREFIX)) { + bfb = nestedPropsQuery(keyValueBoolQuery(term.getKey(), stringValue)); + } else { + bfb = termQuery(term.getKey(), stringValue); } } if (mustMatchAll) { @@ -664,11 +716,26 @@ static String qs(String query) { return query.trim(); } + static Query qsParsed(String query) { + if (StringUtils.isBlank(query) || "*".equals(query.trim())) { + return null; + } + try { + StandardQueryParser parser = new StandardQueryParser(); + parser.setAllowLeadingWildcard(false); + return parser.parse(query, ""); + } catch (Exception ex) { + logger.warn("Failed to parse query string '{}'.", query); + } + return null; + } + /** * Converts a {@link ParaObject} to a map of fields and values. * @param po an object * @return a map of keys and values */ + @SuppressWarnings("unchecked") static Map getSourceFromParaObject(ParaObject po) { if (po == null) { return Collections.emptyMap(); @@ -676,11 +743,70 @@ static Map getSourceFromParaObject(ParaObject po) { Map data = ParaObjectUtils.getAnnotatedFields(po, null, false); Map source = new HashMap<>(data.size() + 1); source.putAll(data); + if (nestedMode() && po instanceof Sysprop) { + try { + List> keysAndValues = new LinkedList<>(); + Map props = (Map) data.get(PROPS_FIELD); + // flatten properites object to array of keys/values, to prevent field mapping explosion + addFieldsToSource(props, keysAndValues, ""); + source.put(PROPS_FIELD, keysAndValues); // overwrite properties object with flattened array + // special field for holding the original sysprop.properties map as JSON string + source.put(PROPS_JSON, ParaObjectUtils.getJsonWriterNoIdent().writeValueAsString(props)); + } catch (Exception e) { + logger.error(null, e); + } + } + for (String field : IGNORED_FIELDS) { + source.remove(field); + } // special DOC ID field used in "search after" source.put("_docid", NumberUtils.toLong(Utils.getNewId())); return source; } + /** + * Flattens a complex object like a property Map ({@code Sysprop.getProperties()}) to a list of key/value pairs. + * Rearranges properites to prevent field mapping explosion, for example: + * properties: [{k: key1, v: value1}, {k: key2, v: value2}...] + * @param objectData original object properties + * @param keysAndValues a list of key/value objects, each containing one property + * @param fieldPrefix a field prefix, e.g. "properties.key" + */ + @SuppressWarnings("unchecked") + private static void addFieldsToSource(Map objectData, List> keysAndValues, + String fieldPrefix) { + if (objectData == null || keysAndValues == null) { + return; + } + for (Entry entry : objectData.entrySet()) { + String pre = (StringUtils.isBlank(fieldPrefix) ? "" : fieldPrefix + "-"); + String field = pre + entry.getKey(); + Object value = entry.getValue(); + if (value != null) { + Map propMap = new HashMap(2); + propMap.put("k", field); + if (value instanceof Map) { + // recursively flatten all nested objects + addFieldsToSource((Map) value, keysAndValues, pre + field); + } else if (value instanceof List) { + // input array: key: [value1, value2] + // flattened array: [{k: key.0, v: value1}, {k: key.1, v: value2}] + for (int i = 0; i < ((List) value).size(); i++) { + Object val = ((List) value).get(i); + addFieldsToSource(Collections.singletonMap(String.valueOf(i), val), keysAndValues, pre + field); + } + } else if (value instanceof Number) { + propMap.put("vn", value); + keysAndValues.add(propMap); + } else { + // boolean and Date data types are ommited for simplicity + propMap.put("v", String.valueOf(value)); + keysAndValues.add(propMap); + } + } + } + } + /** * Converts the source of an ES document to {@link ParaObject}. * @param

object type @@ -693,17 +819,271 @@ static

P getParaObjectFromSource(Map sour } Map data = new HashMap<>(source.size()); data.putAll(source); + // retrieve the JSON for the original properties field and deserialize it + if (nestedMode() && data.containsKey(PROPS_JSON)) { + try { + Map props = ParaObjectUtils.getJsonReader(Map.class). + readValue((String) data.get(PROPS_JSON)); + data.put(PROPS_FIELD, props); + } catch (Exception e) { + logger.error(null, e); + } + data.remove(PROPS_JSON); + } data.remove("_docid"); return ParaObjectUtils.setAnnotatedFields(data); } + /** + * @param operator operator <,>,<=,>= + * @param field field name + * @param stringValue field value + * @return a range query + */ + static QueryBuilder range(String operator, String field, String stringValue) { + String key = StringUtils.replaceAll(field, "[<>=\\s]+$", ""); + boolean nestedMode = nestedMode() && field.startsWith(PROPS_PREFIX); + RangeQueryBuilder rfb = rangeQuery(nestedMode ? getValueFieldName(stringValue) : key); + if (">".equals(operator)) { + rfb.gt(getNumericValue(stringValue)); + } else if ("<".equals(operator)) { + rfb.lt(getNumericValue(stringValue)); + } else if (">=".equals(operator)) { + rfb.gte(getNumericValue(stringValue)); + } else if ("<=".equals(operator)) { + rfb.lte(getNumericValue(stringValue)); + } + if (nestedMode) { + return nestedPropsQuery(keyValueBoolQuery(key, stringValue, rfb)); + } else { + return rfb; + } + } + + /** + * @param query query string + * @return a list of composite queries for matching nested objects + */ + static QueryBuilder convertQueryStringToNestedQuery(String query) { + Query q = qsParsed(StringUtils.trimToEmpty(query)); + if (q == null) { + return matchAllQuery(); + } + return rewriteQuery(q); + } + + /** + * @param q parsed Lucene query string query + * @return a rewritten query with nested queries for custom properties (when in nested mode) + */ + private static QueryBuilder rewriteQuery(Query q) { + QueryBuilder qb = null; + if (q instanceof BooleanQuery) { + qb = boolQuery(); + for (BooleanClause clause : ((BooleanQuery) q).clauses()) { + switch (clause.getOccur()) { + case MUST: + ((BoolQueryBuilder) qb).must(rewriteQuery(clause.getQuery())); + break; + case MUST_NOT: + ((BoolQueryBuilder) qb).mustNot(rewriteQuery(clause.getQuery())); + break; + case FILTER: + ((BoolQueryBuilder) qb).filter(rewriteQuery(clause.getQuery())); + break; + case SHOULD: + default: + ((BoolQueryBuilder) qb).should(rewriteQuery(clause.getQuery())); + } + } + } else if (q instanceof TermRangeQuery) { + qb = termRange(q); + } else if (q instanceof BoostQuery) { + qb = rewriteQuery(((BoostQuery) q).getQuery()).boost(((BoostQuery) q).getBoost()); + } else if (q instanceof TermQuery) { + qb = term(q); + } else if (q instanceof FuzzyQuery) { + qb = fuzzy(q); + } else if (q instanceof PrefixQuery) { + qb = prefix(q); + } else if (q instanceof WildcardQuery) { + qb = wildcard(q); + } else { + logger.warn("Unknown query type in nested mode query syntax: {}", q.getClass()); + } + return (qb == null) ? matchAllQuery() : qb; + } + + private static QueryBuilder termRange(Query q) { + QueryBuilder qb = null; + TermRangeQuery trq = (TermRangeQuery) q; + if (!StringUtils.isBlank(trq.getField())) { + String from = trq.getLowerTerm() != null ? Term.toString(trq.getLowerTerm()) : "*"; + String to = trq.getUpperTerm() != null ? Term.toString(trq.getUpperTerm()) : "*"; + boolean nestedMode = nestedMode() && trq.getField().matches(PROPS_REGEX); + qb = rangeQuery(nestedMode ? getValueFieldNameFromRange(from, to) : trq.getField()); + if ("*".equals(from) && "*".equals(to)) { + qb = matchAllQuery(); + } + if (!"*".equals(from)) { + ((RangeQueryBuilder) qb).from(getNumericValue(from)).includeLower(trq.includesLower()); + } + if (!"*".equals(to)) { + ((RangeQueryBuilder) qb).to(getNumericValue(to)).includeUpper(trq.includesUpper()); + } + if (nestedMode) { + qb = nestedPropsQuery(keyValueBoolQuery(trq.getField(), qb)); + } + } + return qb; + } + + private static QueryBuilder term(Query q) { + QueryBuilder qb; + String field = ((TermQuery) q).getTerm().field(); + String value = ((TermQuery) q).getTerm().text(); + if (StringUtils.isBlank(field)) { + qb = multiMatchQuery(value); + } else if (nestedMode() && field.matches(PROPS_REGEX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, value)); + } else { + qb = termQuery(field, value); + } + return qb; + } + + private static QueryBuilder fuzzy(Query q) { + QueryBuilder qb; + String field = ((FuzzyQuery) q).getTerm().field(); + String value = ((FuzzyQuery) q).getTerm().text(); + if (StringUtils.isBlank(field)) { + qb = multiMatchQuery(value); + } else if (nestedMode() && field.matches(PROPS_REGEX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, fuzzyQuery(getValueFieldName(value), value))); + } else { + qb = fuzzyQuery(field, value); + } + return qb; + } + + private static QueryBuilder prefix(Query q) { + QueryBuilder qb; + String field = ((PrefixQuery) q).getPrefix().field(); + String value = ((PrefixQuery) q).getPrefix().text(); + if (StringUtils.isBlank(field)) { + qb = multiMatchQuery(value); + } else if (nestedMode() && field.matches(PROPS_REGEX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, prefixQuery(getValueFieldName(value), value))); + } else { + qb = prefixQuery(field, value); + } + return qb; + } + + private static QueryBuilder wildcard(Query q) { + QueryBuilder qb; + String field = ((WildcardQuery) q).getTerm().field(); + String value = ((WildcardQuery) q).getTerm().text(); + if (StringUtils.isBlank(field)) { + qb = multiMatchQuery(value); + } else if (nestedMode() && field.matches(PROPS_REGEX)) { + qb = nestedPropsQuery(keyValueBoolQuery(field, wildcardQuery(getValueFieldName(value), value))); + } else { + qb = wildcardQuery(field, value); + } + return qb; + } + + /** + * @param k field name + * @param query query object + * @return a composite query: bool(match(key) AND match(value)) + */ + static QueryBuilder keyValueBoolQuery(String k, QueryBuilder query) { + return keyValueBoolQuery(k, null, query); + } + + /** + * @param k field name + * @param v field value + * @return a composite query: bool(match(key) AND match(value)) + */ + static QueryBuilder keyValueBoolQuery(String k, String v) { + return keyValueBoolQuery(k, v, null); + } + + /** + * @param k field name + * @param v field value + * @param query query object + * @return a composite query: bool(match(key) AND match(value)) + */ + static QueryBuilder keyValueBoolQuery(String k, String v, QueryBuilder query) { + if (StringUtils.isBlank(k) || (query == null && StringUtils.isBlank(v))) { + return matchAllQuery(); + } + QueryBuilder kQuery = matchQuery(PROPS_PREFIX + "k", getNestedKey(k)); + QueryBuilder vQuery = (query == null) ? matchQuery(getValueFieldName(v), v) : query; + if ("*".equals(v) || matchAllQuery().equals(query)) { + return boolQuery().must(kQuery); + } + return boolQuery().must(kQuery).must(vQuery); + } + + /** + * @param query query + * @return a nested query + */ + static NestedQueryBuilder nestedPropsQuery(QueryBuilder query) { + return nestedQuery(PROPS_FIELD, query, Avg); + } + + /** + * @param key dotted field path + * @return translate "properties.path.to.key" to "properties.path-to-key" + */ + static String getNestedKey(String key) { + if (StringUtils.startsWith(key, PROPS_PREFIX)) { + return StringUtils.removeStart(key, PROPS_PREFIX).replaceAll("\\.", "-"); + } + return key; + } + + /** + * @param v search term + * @return the name of the value property inside a nested object, e.g. "properties.v" + */ + static String getValueFieldName(String v) { + return PROPS_PREFIX + (NumberUtils.isDigits(v) ? "vn" : "v"); + } + + /** + * @param from from value + * @param to to value + * @return either "properties.vn" if one of the range limits is a number, or "properties.v" otherwise. + */ + static String getValueFieldNameFromRange(String from, String to) { + if (("*".equals(from) && "*".equals(to)) || NumberUtils.isDigits(from) || NumberUtils.isDigits(to)) { + return PROPS_PREFIX + "vn"; + } + return PROPS_PREFIX + "v"; + } + + /** + * @param v search term + * @return the long value of v if it is a number + */ + static Object getNumericValue(String v) { + return NumberUtils.isDigits(v) ? NumberUtils.toLong(v, 0) : v; + } + /** * A method reserved for future use. It allows to have indexes with different names than the appid. * * @param appid an app identifer * @return the correct index name */ - protected static String getIndexName(String appid) { + static String getIndexName(String appid) { return appid.trim(); } @@ -711,7 +1091,7 @@ protected static String getIndexName(String appid) { * Para indices have 1 type only - "paraobject". From v6 onwards, ES allows only 1 type per index. * @return "paraobject" */ - protected static String getType() { + static String getType() { return ParaObject.class.getSimpleName().toLowerCase(); } @@ -719,7 +1099,7 @@ protected static String getType() { * @param indexName index name or alias * @return e.g. "index-name_*" */ - protected static String getIndexNameWithWildcard(String indexName) { + static String getIndexNameWithWildcard(String indexName) { return StringUtils.contains(indexName, "_") ? indexName : indexName + "_*"; // ES v6 } } diff --git a/src/test/java/com/erudika/para/search/ElasticSearchIT.java b/src/test/java/com/erudika/para/search/ElasticSearchIT.java index 6ae9bd2..cb5c6fe 100644 --- a/src/test/java/com/erudika/para/search/ElasticSearchIT.java +++ b/src/test/java/com/erudika/para/search/ElasticSearchIT.java @@ -20,9 +20,12 @@ import com.erudika.para.core.App; import com.erudika.para.core.ParaObject; import com.erudika.para.core.Sysprop; +import static com.erudika.para.search.SearchTest.appid1; import static com.erudika.para.search.SearchTest.u; import com.erudika.para.utils.Config; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -170,4 +173,116 @@ public void testSharedIndex() throws InterruptedException { ElasticSearchUtils.removeIndexAlias(root, app1); ElasticSearchUtils.removeIndexAlias(root, app2); } + + @Test + public void testNestedIndexing() throws InterruptedException { + System.setProperty("para.es.use_nested_custom_fields", "true"); + String indexInNestedMode = "app-nested-mode"; + ElasticSearchUtils.createIndex(indexInNestedMode); + String type = "cat"; + Sysprop c1 = new Sysprop("c1"); + Sysprop c2 = new Sysprop("c2"); + Sysprop c3 = new Sysprop("c3"); + + c1.setType(type); + c2.setType(type); + c3.setType(type); + c1.setName("Kitty 1"); + c2.setName("Kitty 2"); + c3.setName("Kitty 3"); + c1.setAppid(indexInNestedMode); + c2.setAppid(indexInNestedMode); + c3.setAppid(indexInNestedMode); + c1.setTimestamp(12345678L); + c2.setTimestamp(123456789L); + c3.setTimestamp(1234567890L); + + Map owner1 = new HashMap<>(); + Map owner2 = new HashMap<>(); + Map owner3 = new HashMap<>(); + owner1.put("name", "Alice"); + owner2.put("name", "Bob"); + owner3.put("name", "Chris"); + owner1.put("age", 33); + owner2.put("age", 34); + owner3.put("age", 35); + + c1.addProperty("owner", owner1); + c2.addProperty("owner", owner2); + c3.addProperty("owner", owner3); + + c1.addProperty("text", "This is a little test sentence. Testing, one, two, three."); + c2.addProperty("text", "We are testing this thing. This sentence is a test. One, two."); + c3.addProperty("text", "totally different text - kitty 3."); + + s.index(indexInNestedMode, c1); + s.index(indexInNestedMode, c2); + s.index(indexInNestedMode, c3); + + Thread.sleep(1000); + + // findTermInList + ArrayList terms1 = new ArrayList(); + terms1.add("alice"); + terms1.add("bob"); + List r1 = s.findTermInList(indexInNestedMode, "cat", "properties.owner.name", terms1); + assertEquals(2, r1.size()); + assertTrue(r1.stream().noneMatch((c) -> c.getId().equals("c3"))); + assertTrue(s.findTermInList(indexInNestedMode, "cat", "properties.owner.name", Collections.singletonList("Bo")).isEmpty()); + + // findPrefix + List r2 = s.findPrefix(indexInNestedMode, "cat", "properties.owner.name", "ali"); + assertEquals(1, r2.size()); + assertTrue(r2.get(0).getName().equals("Kitty 1")); + assertTrue(s.findPrefix(indexInNestedMode, "cat", "properties.owner.name", "Deb").isEmpty()); + + // findQuery + List r31 = s.findQuery(indexInNestedMode, "cat", "\"Kitty 2\" AND properties.owner.age:34"); + List r32 = s.findQuery(indexInNestedMode, "cat", "timestamp:{12345678 TO *} AND properties.owner.age:{* TO 34]"); + List r33 = s.findQuery(indexInNestedMode, "cat", "-properties.owner.age:[* TO 33]"); + List r34 = s.findQuery(indexInNestedMode, "cat", "chris"); + List r35 = s.findQuery(indexInNestedMode, "cat", "properties.owner.age:*"); + assertTrue(s.findQuery(indexInNestedMode, "cat", "dog AND properties.owner.age:34").isEmpty()); + assertEquals(1, s.findQuery(indexInNestedMode, "cat", "dog OR properties.owner.age:34").size()); + assertEquals(3, s.findQuery(indexInNestedMode, "cat", "properties.owner.name:[alice TO chris]").size()); + assertEquals(2, s.findQuery(indexInNestedMode, "cat", "properties.owner.name:[alice TO chris}").size()); + assertEquals(1, s.findQuery(indexInNestedMode, "cat", "properties.owner.name:{alice TO chris}").size()); + assertEquals(1, r31.size()); + assertEquals("c2", r31.get(0).getId()); + assertEquals(1, r32.size()); + assertEquals("c2", r32.get(0).getId()); + assertEquals(2, r33.size()); + assertTrue(r33.stream().allMatch((c) -> c.getId().equals("c2") || c.getId().equals("c3"))); + assertEquals(1, r34.size()); + assertEquals("c3", r34.get(0).getId()); + assertEquals(3, r35.size()); + + // findWildcard + assertEquals(1, s.findWildcard(indexInNestedMode, "cat", "properties.owner.name", "ali*").size()); + assertEquals(1, s.findWildcard(indexInNestedMode, "", "properties.owner.name", "chr*").size()); + assertEquals(0, s.findWildcard(indexInNestedMode, "cat", "", "ali*").size()); + assertEquals(2, s.findWildcard(indexInNestedMode, null, "properties.text", "test*").size()); + + // findTerms + Map terms = new HashMap(); + terms.put("timestamp>", 12345678); + terms.put("properties.owner.age>=", 33); + assertEquals(2, s.findTerms(indexInNestedMode, "cat", terms, true).size()); + assertEquals(3, s.findTerms(indexInNestedMode, "cat", terms, false).size()); + + // findSimilar + assertTrue(s.findSimilar(indexInNestedMode, "cat", "", null, null).isEmpty()); + assertTrue(s.findSimilar(indexInNestedMode, "cat", "c3", new String[]{"properties.text"}, + (String) c3.getProperty("text")).isEmpty()); + assertTrue(s.findSimilar(indexInNestedMode, "cat", "", new String[0], "").isEmpty()); + List res = s.findSimilar(indexInNestedMode, "cat", "c1", + new String[]{"properties.text"}, (String) c2.getProperty("text")); + assertFalse(res.isEmpty()); + assertEquals(c2, res.get(0)); + + + s.unindexAll(indexInNestedMode, Arrays.asList(c1, c2, c3)); + ElasticSearchUtils.deleteIndex(indexInNestedMode); + System.setProperty("para.es.use_nested_custom_fields", "false"); + } } \ No newline at end of file