@elhipernauta
slides.com/elhipernauta/elasticsearch/live
Stop future proofing software
or start doing it the right way
George Hosu
La contracultura minimalista
RubyConf Uruguay
Michel Martens
Command-line Tools can be 235x Faster than your Hadoop Cluster
Adam Drake
Windows Subsystem for Linux
Microsoft
$ echo '{"count": 598438811}' | jq .count
598438811
$ ls /usr/bin/csv*
csvclean csvcut csvformat csvgrep csvjoin csvjson
csvlook csvpy csvsort csvsql csvstack csvstat
datos.gob.ar
Representational State Transfer
curl -X POST \
-H "Content-Type: application/json" \
-d '{"message": "Hello"}' \
http://localhost/speak
< HTTP/1.1 200
< Content-Type: application/json; charset=UTF-8
{
"id": 9494818210
}
Método
Tipo de Mensaje
Mensaje
Recurso
Estado
Tipo de Respuesta
Respuesta
AWS && 90%
sudo docker run \
-p 9200:9200 \
-p 9300:9300 \
-v /home/mariano/elasticsearch_data:/usr/share/elasticsearch/data \
-e "discovery.type=single-node" \
docker.elastic.co/elasticsearch/elasticsearch:6.4.2
REST
Cluster
Cluster folder
curl -X GET 'http://localhost:9200'
{
"name" : "fvOoagB",
"cluster_name" : "docker-cluster",
"cluster_uuid" : "3yQipWUIS_uI1Xcch5sr5Q",
"version" : {
"number" : "6.4.2",
"build_flavor" : "default",
"build_type" : "tar",
"build_hash" : "04711c2",
"build_date" : "2018-09-26T13:34:09.098244Z",
"build_snapshot" : false,
"lucene_version" : "7.4.0",
"minimum_wire_compatibility_version" : "5.6.0",
"minimum_index_compatibility_version" : "5.0.0"
},
"tagline" : "You Know, for Search"
}
Puerto
Método
curl -X GET 'http://localhost:9200/_cluster/health?pretty'
{
"cluster_name" : "docker-cluster",
"status" : "green",
"timed_out" : false,
"number_of_nodes" : 1,
"number_of_data_nodes" : 1,
"active_primary_shards" : 0,
"active_shards" : 0,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 0,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 100.0
}
$ jq
Estado del cluster
curl -H 'Content-type: application/json' \
-X PUT \
'http://localhost:9200/locations' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
}
}'
{
"acknowledged": true,
"shards_acknowledged": true,
"index": "locations"
}
Índice
NODO 1
1
curl -X DELETE 'http://localhost:9200/locations'
{
"acknowledged": true
}
curl -H 'Content-type: application/json' \
-X PUT \
'http://localhost:9200/locations' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 2
}
}'
{
"acknowledged": true,
"shards_acknowledged": true,
"index": "locations"
}
NODO 1
1
2
1
curl -X GET 'http://localhost:9200/_cluster/health' | jq .status
"yellow"
NODO 1
NODO 2
NODO 3
1
2
1
curl -H 'Content-type: application/json' \
-X PUT \
'http://localhost:9200/locations' \
-d '{
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1
}
}'
NODO 1
NODO 2
NODO 3
NODO 4
NODO 5
1
3
2
4
5
3
5
1
2
4
N = (1 + S) * R
R = N / (1 + S)
R = 6 / (1 + 2) = 2
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/locations/_doc' \
-d '{
"url": "http://shoutbar.com.ar",
"name": "SHOUT Brasas & Drinks",
"location": {
"lat": -34.596685,
"lon": -58.376889
}
}'
{
"_index": "locations",
"_type": "_doc",
"_id": "o2Ob42YBEDmcWCdW2oVY",
"_version": 1,
...
}
Tipo
ID
Documento
curl -X GET 'http://localhost:9200/locations/_doc/o2Ob42YBEDmcWCdW2oVY'
{
"_index" : "locations",
"_type" : "_doc",
"_id" : "o2Ob42YBEDmcWCdW2oVY",
"_version" : 1,
"found" : true,
"_source" : {
"url" : "http://shoutbar.com.ar",
"name" : "SHOUT Brasas & Drinks",
"location" : {
"lat" : -34.596685,
"lon" : -58.376889
}
}
}
Versión
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/locations/_doc/o2Ob42YBEDmcWCdW2oVY/_update' \
-d '{
"doc": {
"name": "SHOUT Brasas and Drinks"
}
}'
{
"_index": "locations",
"_type": "_doc",
"_id": "o2Ob42YBEDmcWCdW2oVY",
"_version": 2,
...
}
ID
Tipo
Índice
Documento Parcial
Nueva versión
curl -X DELETE 'http://localhost:9200/locations/_doc/o2Ob42YBEDmcWCdW2oVY'
{
"_index": "locations",
"_type": "_doc",
"_id": "o2Ob42YBEDmcWCdW2oVY",
"_version": 3,
...
}
Nueva versión
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/locations/_doc/1' \
-d '{
"url": "http://shoutbar.com.ar",
"name": "SHOUT Brasas & Drinks",
"location": {
"lat": -34.596685,
"lon": -58.376889
}
}'
{
"_index": "locations",
"_type": "_doc",
"_id": "1",
"_version": 1,
...
}
curl -X GET 'http://localhost:9200/locations' | jq .locations.mappings._doc.properties
{
"location": {
"properties": {
"lat": { "type": "float" },
"lon": { "type": "float" }
}
},
"name": {
"type": "text",
...
},
"url": {
"type": "text",
...
}
}
No es geo_point :(
curl -H 'Content-type: application/json' \
-X PUT \
'http://localhost:9200/locations_v2' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"_doc": {
"properties": {
"url": {"type": "keyword"},
"name": {"type": "text"},
"location": {"type": "geo_point"}
}
}
}
}'
curl -H 'Content-type: application/json' \
-X PUT 'http://localhost:9200/locations_v2/_mapping/_doc' \
-d '{
"properties": {
"url": {"type": "keyword"},
"name": {"type": "text"},
"location": {"type": "geo_point"}
}
}'
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/_reindex' \
-d '{
"source": {
"index": "locations"
},
"dest": {
"index": "locations_v2"
}
}'
github.com/taskrabbit/elasticsearch-dump
curl -X GET 'http://localhost:9200/locations_v2/_search' | jq .hits.hits
[
{
"_index": "locations_v2",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"url": "http://shoutbar.com.ar",
"name": "SHOUT Brasas & Drinks",
"location": {
"lat": -34.596685,
"lon": -58.376889
}
}
}
]
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/_reindex' \
-d '{
"source": {
"index": "locations"
},
"dest": {
"index": "locations_v2"
},
"script": {
"source": "ctx._version++; ctx._source.remove(\"locationn\");",
"lang": "painless"
}
}'
Corrigiendo typo
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/_aliases' \
-d '{
"actions": [
{
"add": {
"index": "locations_v2",
"alias": "locations"
}
}
]
}'
curl -H 'Content-Type: application/x-ndjson' \
-X POST 'http://localhost:9200/_bulk?refresh=false' \
--data-binary '{ "index": {"_index": "locations", "_type": "_doc", "_id": "1"} }
{ "name": "SHOUT Brasas & Drinks", ... }
{ "index": {"_index": "locations", "_type": "_doc", "_id": "2"} }
{ "name": "Adorado Bar", ... }
'
Un documento por línea
\n necesario al final
no hay coma
refresh
operación bulk
curl -H 'Content-Type: application/x-ndjson' \
-X POST 'http://localhost:9200/_bulk?refresh=false' \
--data-binary '{ "update": {"_index": "locations", "_type": "_doc", "_id": "1"} }
{ "doc": { "name": "SHOUT Brasas & Drinks", ... } }
{ "update": {"_index": "locations", "_type": "_doc", "_id": "2"} }
{ "doc": { "name": "Adorado Bar", ... }, "doc_as_upsert": true }
'
upsert
operación bulk
update
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/locations/_doc/_mget' \
-d '{
"ids": [1, 2]
}' | jq '.docs|.[]._source'
{
"name": "SHOUT Brasas & Drinks",
"id": 1
}
{
"name": "Adorado Bar",
"id": 2
}
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/locations/_update_by_query?refresh=false' -d '{
"query": {
"terms": { "id": [1, 2] }
},
"script": {
"source": "ctx._source.name = ctx._source.name + \" (HOT)\"",
"lang": "painless"
}
}'
Update script
Condición
curl -H 'Content-type: application/json' \
-X POST 'http://localhost:9200/locations/_delete_by_query?refresh=false' -d '{
"query": {
"terms": { "id": [1, 2] }
}
}'
Condición
head -n+2 data/twitter/tweets.csv
"tweet_id","in_reply_to_status_id","in_reply_to_user_id","timestamp","source","text","retweeted_status_id","retweeted_status_user_id","retweeted_status_timestamp","expanded_urls"
"1057003303857463296","1052589952368807942","14237093","2018-10-29 20:16:40 +0000","<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","BREAKING: Cambiamos la fecha para el Miércoles 7 de Noviembre - 3 PM en @workana
En mi afán de poner fecha me olvidé de chequear si el resto del mundo podía https://t.co/9vD2Ni868X","","","","https://twitter.com/elhipernauta/status/1057003303857463296/photo/1"
"1056973230307721216","1056972332114345985","293566535","2018-10-29 18:17:10 +0000","<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","@NatiRamoneOk Lo voy a leer. Es un tipo bastante centrado","","","",""
\n sin escapear
$ echo "id,text,created" > data/tweets_prepared.csv
$ csvjson data/twitter/tweets.csv | \
sed 's/\\n\\n/ /g' | \
jq -r '.[]|"\(.tweet_id),\(.text|@json),\(.timestamp|@json)"' | \
sed 's/\\"/""/g' | \
sed "s/'/ /g" >> data/tweets_prepared.csv
$ head -n+3 data/tweets_prepared.csv
id,text,created
1057003303857463300,"BREAKING: Cambiamos la fecha para el Miércoles 7 de Noviembre - 3 PM en @workana En mi afán de poner fecha me olvidé de chequear si el resto del mundo podía https://t.co/9vD2Ni868X","2018-10-29 20:16:40 +0000"
1056973230307721200,"@NatiRamoneOk Lo voy a leer. Es un tipo bastante centrado","2018-10-29 18:17:10 +0000"
csvjson data/tweets_prepared.csv | \
jq --raw-output '.[].text|@sh' | \
awk -F"\n" '{
if (NR % 25 == 0) { printf "%s\n", $1; }
else { printf "%s ", $1; }
} END { printf "\n"; }' | \
xargs -n1 -d "\n" -I '%' sh -c \
'aws comprehend batch-detect-dominant-language --text-list %' | \
jq --raw-output '.ResultList|.[]|.Languages[0].LanguageCode' \
> data/tweets_languages.csv
$ (echo "language" && cat data/tweets_languages.csv) | \
paste --delimiter ',' data/tweets_prepared.csv - \
> data/tweets_languages_prepared.csv
$ head -n+3 data/tweets_languages_prepared.csv
id,text,created,language
1057003303857463300,"BREAKING: Cambiamos la fecha...","2018-10-29 20:16:40 +0000",es
1056973230307721200,"@NatiRamoneOk Lo voy a leer...","2018-10-29 18:17:10 +0000",es
STDIN
$ csvgrep --columns language --match 'es' data/tweets_languages_prepared.csv \
> data/tweets_es_prepared.csv
$ csvjson data/tweets_es_prepared.csv | \
jq --raw-output '.[].text|@sh' | \
awk -F"\n" '{
if (NR % 25 == 0) { printf "%s\n", $1; }
else { printf "%s ", $1; }
} END { printf "\n"; }' | \
xargs -n1 -d "\n" -I '%' sh -c \
'aws comprehend batch-detect-sentiment --language-code es --text-list %' | \
jq --raw-output '.ResultList|.[]|.Sentiment' \
> data/tweets_es_sentiments.csv
Sólo español
$ (echo "sentiment" && cat data/tweets_es_sentiments.csv) | \
paste --delimiter ',' data/tweets_es_prepared.csv - \
> data/tweets_es_sentiments_prepared.csv
$ (cat data/tweets_es_sentiments_prepared.csv && \
tail -n+2 data/tweets_en_sentiments_prepared.csv) \
> data/tweets_sentiments_prepared.csv
$ head -n+3 data/tweets_sentiments_prepared.csv
id,text,created,language,sentiment
1057003303857463300,"BREAKING: Cambiamos...","2018-10-29 20:16:40 +0000",es,NEUTRAL
1056973230307721200,"@NatiRamoneOk Lo voy...","2018-10-29 18:17:10 +0000",es,NEUTRAL
curl -H 'Content-type: application/json' \
-X PUT \
'http://localhost:9200/tweets' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"_doc": {
"properties": {
"id": {"type": "long"},
"text": {"type": "text"},
"language": {"type": "keyword"},
"sentiment": {"type": "keyword"},
"created": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss Z||yyyy-MM-dd"
}
}
}
}
}'
múltiples formatos
csvjson data/tweets_sentiments_prepared.csv | \
jq --compact-output .[] | \
awk -F"\n" '{
id = substr($1, index($1, ":") + 1, index($1, ",") - 1 - length("{\"id\":"));
printf("{\"index\":{\"_index\":\"tweets\", \"_type\":\"_doc\", \"_id\":%s}}\n", id);
printf("%s\n", $1);
}' > data/tweets_sentiments.json
head -n+4 data/tweets_sentiments.json
{"index":{"_index":"tweets", "_type":"_doc", "_id":1057003303857463300}}
{"id":1057003303857463300,"text":"BREAKING: Cambiamos la fecha para el Miércoles 7 de Noviembre - 3 PM en @workana En mi afán de poner fecha me olvidé de chequear si el resto del mundo podía https://t.co/9vD2Ni868X","created":"2018-10-29 20:16:40 +0000","language":"es","sentiment":"NEUTRAL"}
{"index":{"_index":"tweets", "_type":"_doc", "_id":1056973230307721200}}
{"id":1056973230307721200,"text":"@NatiRamoneOk Lo voy a leer. Es un tipo bastante centrado","created":"2018-10-29 18:17:10 +0000","language":"es","sentiment":"NEUTRAL"}
curl -H 'Content-Type: application/x-ndjson' \
-X POST 'http://localhost:9200/_bulk?refresh=false' \
--data-binary '@data/tweets_sentiments.json'
archivo generado
curl -X GET 'http://localhost:9200/tweets/_search'
"hits": {
"total": 20366,
"max_score": 1,
"hits": [
{
"_index": "tweets",
"_type": "_doc",
"_id": "1057003303857463300",
"_score": 1,
"_source": {
"id": 1057003303857463300,
"text": "BREAKING: Cambiamos la fecha para el Miércoles 7 de Noviembre - 3 PM en @workana En mi afán de poner fecha me olvidé de chequear si el resto del mundo podía https://t.co/9vD2Ni868X",
"created": "2018-10-29 20:16:40 +0000",
"language": "es",
"sentiment": "NEUTRAL"
}
},
...
total
curl -X GET 'http://localhost:9200/tweets/_count' | jq .count
20366
curl -X GET 'http://localhost:9200/tweets/_search' | \
jq '.hits.hits|.[]._source.id' | \
wc -l
10
curl -X GET 'http://localhost:9200/tweets/_search' | jq '.hits.hits|.[]._source.id'
1057003303857463300
1056973230307721200
....
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' -d '{
"from": 10,
"size": 10
}' | jq '.hits.hits|.[]._source.id'
1056724419740078100
1056723710831484900
...
límite: 10000
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search?scroll=1m' -d '{
"size": 100
}'
"_scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAABwWZnZPb2FnQjVSOHF0TGN1VXA2eDJfZw==",
...
"hits": {
"hits": [
{
"_index": "tweets",
"_type": "_doc",
"_id": "1057003303857463300",
"_score": 1,
"_source": {
"id": 1057003303857463300,
"text": "BREAKING: Cambiamos la fecha para el Miércoles 7 de Noviembre - 3 PM en @workana En mi afán de poner fecha me olvidé de chequear si el resto del mundo podía https://t.co/9vD2Ni868X",
"created": "2018-10-29 20:16:40 +0000",
"language": "es",
"sentiment": "NEUTRAL"
}
},
...
scroll ID
TTL
cuántos por "página"
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/_search/scroll' -d '{
"scroll": "1m",
"scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAABwWZnZPb2FnQjVSOHF0TGN1VXA2eDJfZw=="
}'
"_scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAB4WZnZPb2FnQjVSOHF0TGN1VXA2eDJfZw==",
...
"hits": {
"hits": [
{
"_index": "tweets",
"_type": "_doc",
"_id": "1045474799005380600",
"_score": 1,
"_source": {
"id": 1045474799005380600,
"text": "@seppo0011 @malerey_ La de \"if you build it..\"? Apoyo la moción",
"created": "2018-09-28 00:46:30 +0000",
"language": "es",
"sentiment": "NEUTRAL"
}
},
...
¡Puede cambiar!
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"term": {"sentiment": "POSITIVE"}
}
}
}
}'
Query
Filtro
"hits": {
"total": 2158,
"hits": [
{
"_index": "tweets",
"_type": "_doc",
"_id": "1056696926408790000",
"_score": 1,
"_source": {
"id": 1056696926408790000,
"text": "Obviando el éxito re merecido de Red Dead Redemption 2, no hay que dejar de jugar The Last of Us II. Se ve muy bien, y como en el otro, es también notoria la mejora en los gestos faciales https://t.co/yHzEqNXFtY",
"created": "2018-10-28 23:59:14 +0000",
"language": "es",
"sentiment": "POSITIVE"
}
},
...
Relevancia constante
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{"query": {
"match": {"text": "php"}
}}'
"hits": {
"total": 278,
"hits": [
{
"_index": "tweets",
"_type": "_doc",
"_id": "847839704330403800",
"_score": 6.539947,
"_source": {
"id": 847839704330403800,
"text": "RT @hhamon: PHP internal cache efficiency comparison between PHP 5.6 and PHP 7.1 #symfony_live https://t.co/heeIWit0PN",
"created": "2017-03-31 15:55:12 +0000",
"language": "en",
"sentiment": "NEUTRAL"
}
},
...
Sin case
Relevancia
"Tired of people that use PHP short tags"
tired
of
people
that
use
php
short
tags
"tired of people that use php short tags"
lowercase filter
standard tokenizer
tired
of
people
that
use
php
short
tags
tweet 1
tweet 2
tweet 3
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {
"bool": {
"must": [
{"match": {"text": "php"}},
{"match": {"text": "python"}}
],
"filter": {
"term": {"sentiment": "NEGATIVE"}
}
}
}
}' | jq .hits.hits[0]._source
{
"id": 125223429413670910,
"text": "Switching between C++, Node.js, Python, PHP, and Bash on a daily basis generates interesting syntax errors",
"created": "2011-10-15 14:56:06 +0000",
"language": "en",
"sentiment": "NEGATIVE"
}
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {
"bool": {
"must": [
{"match": {"text": "gato"}},
{"match": {"text": "lindo"}}
],
"filter": {
"bool": {
"must": [
{"term": {"language": "es"}},
{"range": {"created": {
"lte": "2018-12-31"
}}}
]
}
}
}
}
}' | jq .hits.hits
A AND B
A OR B
A AND (B OR C)
"bool": { "must": [ A, B ]}
"bool": { "should": [ A, B ]}
"bool": { "must": [
A,
{ "bool": { "should": [ B, C ] } }
]}
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{"query": {
"bool": {"must": [
{"match": {"text": "php"}},
{"boosting": {
"positive": {
"term": {"sentiment": "POSITIVE"}
},
"negative": {
"term": {"sentiment": "NEGATIVE"}
},
"negative_boost": 0.5
}}
]}
}}' | jq .hits.hits
Múltiplo para relevancia
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {
"bool": {
"must": {
"match": {"text": "boca"}
},
"filter": {
"range": {"created": {
"lte": "2011-12-31"
}}
}
}
},
"highlight": {
"fields": {
"text": {
"pre_tags" : ["<em>"],
"post_tags" : ["</em>"]
}
}
}
}' | jq .hits.hits
Tags customizados
[
{
"_index": "tweets",
"_type": "_doc",
"_id": "86989802628591620",
"_score": 8.590066,
"_source": {
"id": 86989802628591620,
"text": "Se acuerdan del debate sobre quien es mejor, River o Boca? Bueno, Boca GANO",
"created": "2011-07-02 02:49:19 +0000",
"language": "es",
"sentiment": "NEUTRAL"
},
"highlight": {
"text": [
"Se acuerdan del debate sobre quien es mejor, River o <em>Boca</em>? Bueno, <em>Boca</em> GANO"
]
}
},
...
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"size": 0,
"aggs": {
"sentiments": {
"terms": {
"field": "sentiment",
"missing": "UNKNOWN",
"order" : { "_count" : "desc" }
}
}
}
}' | jq .
No necesito documentos
Los que no tengan sentiment
Custom key
"aggregations": {
"sentiments": {
"sum_other_doc_count": 0,
"doc_count_error_upper_bound": 0,
"buckets": [
{
"key": "NEUTRAL",
"doc_count": 16451
},
{
"key": "POSITIVE",
"doc_count": 2158
},
{
"key": "NEGATIVE",
"doc_count": 1698
},
{
"key": "MIXED",
"doc_count": 59
}
]
}
}
Puede no ser exacto
Valores no agrupados
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"size": 0,
"aggs": {
"twitter_tags": {
"filters": {
"other_bucket_key": "other",
"filters": {
"php": {"match": {"text": "php"} },
"js": {"match": {"text": "javascript"} },
"python": {"match": {"text": "python"} }
}
}
}
}
}' | jq .
Los que no son agrupados
"aggregations": {
"twitter_tags": {
"buckets": {
"js": {
"doc_count": 10
},
"php": {
"doc_count": 278
},
"python": {
"doc_count": 25
},
"other": {
"doc_count": 20059
}
}
}
}
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"term": {"sentiment": "NEGATIVE"}}
}},
"size": 0,
"aggs": {
"twitter_tags": {
"filters": {
"other_bucket_key": "other",
"filters": {
"php": {"match": {"text": "php"} },
"js": {"match": {"text": "javascript"} },
"python": {"match": {"text": "python"} }
}
}
}
}
}' | jq .
Query y filtro pre-aggregation
"aggregations": {
"twitter_tags": {
"buckets": {
"js": {
"doc_count": 0
},
"php": {
"doc_count": 14
},
"python": {
"doc_count": 1
},
"other": {
"doc_count": 1684
}
}
}
}
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"range": {"created": {"gte": "2018-01-01"}}}
}},
"size": 0,
"aggs": {
"tweets_timeline": {
"date_histogram": {
"field": "created",
"interval": "1M",
"format": "YYYY-MM"
}
}
}
}' | jq .aggregations.tweets_timeline.buckets
[
{
"key_as_string": "2018-01",
"key": 1514764800000,
"doc_count": 250
},
{
"key_as_string": "2018-02",
"key": 1517443200000,
"doc_count": 174
},
...
{
"key_as_string": "2018-10",
"key": 1538352000000,
"doc_count": 228
}
]
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/properties/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"term": {"address_state": "CA"}}
}},
"size": 0,
"aggs": {
"price_points": {
"range": {
"field": "price_list",
"ranges": [
{ "to" : 100000 },
{ "from" : 100000, "to" : 250000 },
{ "from" : 250000, "to" : 500000 },
{ "from" : 500000, "to" : 1000000 },
{ "from" : 100000, "to" : 2000000 },
{ "from" : 2000000 }
]
}
}
}
}' | jq .aggregations.price_points.buckets
[
{
"key": "*-100000.0",
"to": 100000,
"doc_count": 52452
},
{
"key": "100000.0-250000.0",
"from": 100000,
"to": 250000,
"doc_count": 27313
},
{
"key": "100000.0-2000000.0",
"from": 100000,
"to": 2000000,
"doc_count": 179142
},
...
{
"key": "2000000.0-*",
"from": 2000000,
"doc_count": 10423
}
]
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/properties/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"bool": {"must": [
{"term": {"address_state": "CA"}},
{"terms": {"address_zip": ["90210", "90402"]}},
{"term": {"category": "Purchase"}},
{"term": {"type": "Residential"}}
]}}
}},
"size": 0,
"aggs": {
"90210": {
"filter" : { "term": { "address_zip": "90210" } },
"aggs": {
"price_average": {"avg": { "field": "price_list" }}
}
},
"90402": {
"filter" : { "term": { "address_zip": "90402" } },
"aggs": {
"price_average": {"avg": { "field": "price_list" }}
}
}
}
}' | jq .aggregations
aggs dentro de aggs
{
"90210": {
"doc_count": 269,
"price_average": {
"value": 9082192.126394052
}
},
"90402": {
"doc_count": 79,
"price_average": {
"value": 5000025.316455696
}
}
}
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/properties/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"bool": {"must": [
{"term": {"address_state": "CA"}},
{"terms": {"address_zip": ["90210", "90402"]}},
{"term": {"category": "Purchase"}},
{"term": {"type": "Residential"}}
]}}
}},
"size": 0,
"aggs": {
"90210": {
"filter" : { "term": { "address_zip": "90210" } },
"aggs": {
"prices": {"percentiles": { "field": "price_list" }}
}
},
"90402": {
"filter" : { "term": { "address_zip": "90402" } },
"aggs": {
"prices": {"percentiles": { "field": "price_list" }}
}
}
}
}' | jq .aggregations
{
"90210": {
"doc_count": 269,
"prices": {
"values": {
"1.0": 712600,
"5.0": 1099000,
"25.0": 2495000,
"50.0": 4740000,
"75.0": 8795000,
"95.0": 29999000,
"99.0": 73196599.99999994
}
}
},
"90402": {
"doc_count": 79,
"prices": {
"values": {
"1.0": 794600,
"5.0": 882400,
"25.0": 2150000,
"50.0": 4175000,
"75.0": 6560000,
"95.0": 12749999.999999985,
"99.0": 18786000
}
}
}
}
Valor mayor al 95% del universo
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/tweets/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"bool": {"must": [
{"term": {"sentiment": "NEGATIVE"}},
{"term": {"language": "es"}}
]}}
}},
"size": 0,
"aggs": {
"negative_tags": {
"significant_text": {
"field": "text",
"filter_duplicate_text": true,
"background_filter": {
"term": {"language": "es"}
}
}
}
}
}' | jq .aggregations.negative_tags.buckets
Universos comparables
[
{
"key": "mal",
"doc_count": 62,
"score": 0.20322455531475084,
"bg_count": 117
},
{
"key": "asco",
"doc_count": 29,
"score": 0.15191889182958104,
"bg_count": 37
},
{
"key": "triste",
"doc_count": 29,
"score": 0.14729905568076074,
"bg_count": 38
}
...
]
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/properties/property/_search' \
-d '{
"query": {"bool": {
"must": {"match_all": {}},
"filter": {"term": {"address_state": "CA"}}
}},
"size": 0,
"suggest": {
"cities": {
"text": "Bevrly Hills",
"term": {
"field": "address_city"
}
}
}
}' | jq .suggest.cities
Data subset
Texto con typo
Field con candidatos
[
{
"text": "Bevrly Hills",
"offset": 0,
"length": 12,
"options": [
{
"text": "Beverly Hills",
"score": 0.9166667,
"freq": 2298
}
]
}
]
curl -H 'Content-type: application/json' \
-X GET 'http://localhost:9200/addresses/_doc/_search' \
-d '{
"size": 0,
"suggest": {
"addresses": {
"prefix": "35016 W",
"completion": {
"field": "address_complete_address",
"skip_duplicates": true,
"contexts": {
"address_state": "CA"
}
}
}
}
}' | jq '.suggest.addresses|.[].options'
Filtro en memoria
Autocomplete input
Campo tipo completion
[
{
"text": "35016 WIEMILLER RD, TOLLHOUSE, CA, 93667",
...
"_source": {
"address_street": "WIEMILLER",
...
"address_complete_address": [
{
"input": "35016 WIEMILLER RD, TOLLHOUSE, CA, 93667",
"weight": 100
},
{
"input": "WIEMILLER RD, TOLLHOUSE, CA, 93667"
}
],
},
"contexts": {
"address_state": [
"CA"
]
}
},
...
Prefijo sólo LTR
curl -H 'Content-type: application/json' \
-X PUT 'http://localhost:9200/addresses' \
-d '{"mappings": {"_doc": {
"properties": {
"id": {"type": "long"},
"address_location" : { "type" : "geo_point" },
"address_state" : { "type" : "keyword" },
"address_complete_address" : {
"type" : "completion",
"contexts" : [
{
"name" : "address_state",
"type" : "CATEGORY",
"path" : "address_state"
},
{
"name" : "address_location",
"type" : "GEO",
"precision" : 6,
"path" : "address_location"
}
]
},
...
Contextos
@elhipernauta