ElasticSearch

“You know, for search”

Clinton Gormley, YAPC::EU 2010

Overview

About me

What is ElasticSearch?

What is ElasticSearch.pm?

Why ElasticSearch?

First make it easy - then make it powerful

Installing ElasticSearch

Install ElasticSearch

    wget http://cloud.github.com/downloads/elasticsearch/elasticsearch/elasticsearch-0.9.0.zip
    unzip elasticsearch-0.9.0.zip

Install the service wrapper (optional)

    wget http://github.com/elasticsearch/elasticsearch-servicewrapper/zipball/master
    unzip *servicewrapper*.zip
    mv *servicewrapper*/service elasticsearch-0.9.0/bin/    

Install ElasticSearch.pm

    cpanm ElasticSearch

Configuring ElasticSearch

./config/elasticsearch.yml

    cluster:
        name:               MyCluster

    network:
        host:               127.0.0.1

    gateway:                
        type:               fs
        fs: 
            location:       /opt/es_data/  # shared dir, eg NFS 

Starting ElasticSearch

Check that it is running

    curl -XGET 'http://127.0.0.1:9200/_cluster/state?pretty=true'
    
    {
      "cluster_name"    : "MyCluster",
      "master_node"     : "57a8fcd6-3867-465f-a960-b748fc61ff76",
      "blocks"          : { },
      "nodes"           : {
        "57a8fcd6-3867-465f-a960-b748fc61ff76" : {
          "name"                : "Vindaloo",
          "transport_address"   : "inet[/127.0.0.1:9300]"
        }
      },
      "metadata"        : { "indices" : { } },
      "routing_table"   : { "indices" : { } },
      "routing_nodes"   : { "unassigned" : [ ], "nodes" : { } }
    }

Connecting to ElasticSearch

    use ElasticSearch();
    my $c = ElasticSearch->new( servers => '127.0.0.1:9200' );
    my $c = ElasticSearch->new( 
        servers => [ '192.168.0.10:9200', '192.168.0.11:9200' ],
    );
    
    $c->trace_calls(1);  # STDERR, or $c->trace_calls('es.log')

    $c->cluster_state

    curl -XGET 'http://127.0.0.1:9200/_cluster/state' 
    # {
    #    "routing_nodes" : {
    #       "unassigned" : [],
    #       "nodes" : {}
    #    },
    #    "routing_table" : {
    #       "indices" : {}
    #    },
    #    "blocks" : {},
    #    "metadata" : {
    #       "indices" : {}
    #    },
    #    "nodes" : {
    #       "57a8fcd6-3867-465f-a960-b748fc61ff76" : {
    #          "transport_address" : "inet[/127.0.0.1:9300]",
    #          "name" : "Vindaloo"
    #       }
    #    },
    #    "master_node" : "57a8fcd6-3867-465f-a960-b748fc61ff76",
    #    "cluster_name" : "MyCluster"
    # }

Create an index

    $c->create_index(index=>'yapc')

    curl -XPUT 'http://127.0.0.1:9200/yapc/'
    # {
    #    "ok" : true,
    #    "acknowledged" : true
    # }

Create an index

    $c->create_index(
        index   =>  'yapc',
        defn    =>  {
            number_of_shards    =>  3,
            number_of_replicas  =>  1
        }
    );

    curl -XPUT 'http://127.0.0.1:9200/yapc/'  -d '
    {
       "index" : {
          "number_of_replicas" : 1,
          "number_of_shards" : 3
       }
    }
    '
    # {
    #    "ok" : true,
    #    "acknowledged" : true
    # }

Indices, Shards and Nodes

How the cluster works

Storing data to the cluster

Index a document which is hashed to Shard 3

Searching the cluster

Map/Reduce or Scatter/Gather

Primaries vs Replicas

Data structure

Each document has:

Indexing a document

    $c->index(
        index   => 'yapc', type    => 'talk', id      => 1,
        data    => {
            title       => 'ElasticSearch, you know, for search',
            subject     => 'A discussion of blah blah blah...',
            date        => '2010-08-06',
            duration    => 20,
            votes       => 10,
            tags        => ['perl','lucene','magic'],
            speaker     => {
                name    => 'Clinton Gormley',
                email   => 'clint@traveljury.com',
            }    
        }
    );

    # {
    #    "ok" : true,
    #    "_index" : "yapc",
    #    "_id" : "1",
    #    "_type" : "talk"
    # }

Indexing a document

Retrieve a document

    $c->get(
        index   => 'yapc',
        type    => 'talk',
        id      => 1
    );

    curl -XGET 'http://127.0.0.1:9200/yapc/talk/1' 
    # {
    #    "_source" : {
    #       "subject" : "A discussion of blah blah blah...",
    #       "date" : "2010-08-06",
    #       "speaker" : {
    #          "email" : "clint@traveljury.com",
    #          "name" : "Clinton Gormley"
    #       },
    #       "title" : "ElasticSearch, you know, for search",
    #       "duration" : 20,
    #       "votes" : 10,
    #       "tags" : [
    #          "perl",
    #          "lucene",
    #          "magic"
    #       ]
    #    },
    #    "_index" : "yapc",
    #    "_id" : "1",
    #    "_type" : "talk"
    # }

Searching

    $c->search(
        index   => 'yapc',
        type    => 'talk',
        query   => {
            field => { _all => 'magic' }                      
        }
    );

    curl -XGET 'http://127.0.0.1:9200/yapc/talk/_search'  -d '
    {
       "query" : {
          "field" : {
             "_all" : "magic"
          }
       }
    }
    '
    # {
    #    "hits" : {
    #       "hits" : [
    #          {
    #             "_source" : {
    #                "subject" : "A discussion of blah blah blah...",
    #                "date" : "2010-08-06",
    #                "speaker" : {
    #                   "email" : "clint@traveljury.com",
    #                   "name" : "Clinton Gormley"
    #                },
    #                "title" : "ElasticSearch, you know, for search",
    #                "duration" : 20,
    #                "votes" : 10,
    #                "tags" : [
    #                   "perl",
    #                   "lucene",
    #                   "magic"
    #                ]
    #             },
    #             "_score" : 0.09196241,
    #             "_index" : "yapc",
    #             "_id" : "1",
    #             "_type" : "talk"
    #          }
    #       ],
    #       "max_score" : 0.09196241,
    #       "total" : 1
    #    },
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 5,
    #       "total" : 5
    #    }
    # }

Searching

    $c->search(
        index   => 'yapc',
        type    => 'talk',
        query   => {
            field => { _all => 'magic' }                      
        }
    );

Analysis

Field types

ElasticSearch tries:

Tries to guess each field type:

    {
        "description":      "foobar",       # string
        "count":            123,            # integer
        "price":            123.45,         # float
        "date":             "2010-01-01",   # date
    }

Field types

BUT:

    {
        "description":      123,            # integer
        "count":            "123",          # string
    }

Also:

    { "tags":   ["magic"] }           # Term  'magic'
    { "tags":   ["black-magic"] }     # Terms 'black','magic'

Mapping

Mapping gives us control:

Query types: match_all

Match all documents

    $c->search(
        index   => 'yapc',
        type    => 'talk',
        from    => 0,
        size    => 10,
        sort    => ['date', { votes => 'desc' } ],
        query   => {  match_all => {} } 
    );

Query types: term / prefix / wildcard

Term:

    { term    => { tags => 'magic' } }
    

Prefix:

    { prefix    => { tags => 'mag' } }
    

Wildcard:

    { wildcard  => { tags => 'm*g?c' } }
    

Query types: Range

Ranges:

    { range   => {
        votes   => {
            'gt'    => 10,
            'lte'   => 20,           
        }
    } }


    { range   => {
        'speaker.name' => {
            'gte'      => 'abigail',
            'lte'      => 'larry',           
        }
    } }

Query types: query_string | field

Field:

    { field   => { title   => 'know search' } }

Query String:

    { query_string  => { query   => 'search discussion' } }

    { query_string  => { 
        query       => '(search OR elasticsearch) AND discussion '
                     . 'AND speaker.name:(clinton OR clint)',
        phrase_slop => 5,
        fields      => ['_all', 'title^5' ],
        use_dis_max => 1,
        tie_breaker => 0.5 
    } }

Query types: other

Filters

Filter example

    $c->search( index => 'yapc', type => 'talk',
        'query' => {
            'filtered' => {
                'query' => { 'field' => {'speaker.name'  => 'clint*' }},
                'filter' => {
                    'and' => {
                        'filters'   => [
                            { 'terms' => {
                                'tag'       => 'magic',
                                'duration'  => [20,50]
                            }},
                            { 'range' => {
                                'date'  => { 'from'  => '2010-08-01', 'to'    => '2010-08-10' }
                            }}
                        ]
                    }
                },
    }});

Facets

Facets provide aggregated data based on the search request.

Future for ElasticSearch.pm

URLs