Python ES

Deleting an Index

curl -XDELETE 'es:9200/twitter?pretty'
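
For completeness, the same delete can be done from Python. A minimal sketch, assuming the photolt.home.ts:9200 host used in the rest of these notes; the ignore=[404] keyword just suppresses the error when the index does not exist.

import elasticsearch

es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
# Drop the 'twitter' index; ignore a 404 if it is not there
es.indices.delete(index='twitter', ignore=[404])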

Count from an Index

import elasticsearch
from pprint import pprint

es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
res = es.count(index='text')
pprint(res)
print('Index has {} records'.format(res['count']))

From the command line this is:

curl --silent photolt.home.ts:9200/text/_count
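
If you only want to count the documents that match a query, the same call takes a body. A hedged sketch - the search term is just a hypothetical value, the host and index are the ones used above:

import elasticsearch
from pprint import pprint

es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
# Count only the documents whose msg field matches the (hypothetical) term
res = es.count(index='text', body={'query': {'match': {'msg': 'London'}}})
pprint(res)
print('Matching records: {}'.format(res['count']))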

Adding Data

Manipulate the ES engine using Python (what else?).

Here we create a fake Twitter-style message: From, To and Text - a very common form of IM representation.

Half of the records are in English, the other half in Farsi - I need to ensure full UTF-8 compliance.

This uses the Faker Factory class, as well as the elasticsearch module.

import elasticsearch
import json
import random
from faker import Factory
fake = Factory.create('en_GB')


es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
# use custom host and port 9200

INDEX='text'
DOC_TYPE='blog'
MAX_RECORDS=10
swapped_lang=False
for i in range(0, MAX_RECORDS):

    if i > int(MAX_RECORDS / 2) and not swapped_lang:
        # Switch the Faker locale to Farsi halfway through the run
        fake = Factory.create('fa_IR')
        swapped_lang = True
        print("Swapped Lang")

    data={}
    data['tx']=random.randrange(200,900000)
    data['rx']=random.randrange(200,900000)
    data['msg']=(fake.name()+' '+fake.address()).replace('\n',' ')
    json_text=json.dumps(data,ensure_ascii=False)
    print('{}'.format(json_text))
    es.index(index=INDEX,doc_type=DOC_TYPE,id=i,body=json_text)


print('OK')

All Together

This can be run multiple times; it will just keep adding data to the index.

Please note: this is OK only if you have a small number of records. If you have a large volume of data, please look at the Bulk Example code (listed below).

import elasticsearch
import json
import random
from pprint import pprint

#V0.2 Write data to a text file.
#     In order to validate the search I need to see what data has been sent to the ES index.


def Add(startid,count=10):
    from faker import Factory
    fake = Factory.create('en_GB')
    es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
    # use custom host and port 9200
    ofp=open("data.text","at")
    INDEX='text'
    DOC_TYPE='blog'
    MAX_RECORDS=count
    swapped_lang=False
    for i in range(0, MAX_RECORDS):

        if i > int(MAX_RECORDS / 2) and not swapped_lang:
            # Switch the Faker locale to Farsi halfway through the run
            fake = Factory.create('fa_IR')
            swapped_lang = True
            print("Swapped Lang")

        data={}
        data['tx']=random.randrange(200,900000)
        data['rx']=random.randrange(200,900000)
        data['msg']=(fake.name()+' '+fake.address()).replace('\n',' ')
        json_text=json.dumps(data,ensure_ascii=False)
        print('{}'.format(json_text))
        es.index(index=INDEX,doc_type=DOC_TYPE,id=startid+i,body=json_text)
        ofp.write('{}\n'.format(json_text))
    print('Load Finished')
    ofp.close()

def GetCount():
    es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
    res = es.count(index='text')
    pprint(res)
    print('Index has {} records'.format(res['count']))
    return res['count']

for a in range(0,10):
    try:
        #This will fail the first time as the Index does not exist.
        #A Better exception Handler should be used
        rec_count=GetCount()
    except Exception:
        rec_count = 0
    Add(rec_count,1000)
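
Once a run has finished, a quick search is a simple way to check that the Farsi text round-trips intact. A minimal sketch using match_all, assuming the same host and index names as above:

import elasticsearch

es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])
# Pull back up to 5 documents and print the stored msg field
res = es.search(index='text', body={'query': {'match_all': {}}, 'size': 5})
for hit in res['hits']['hits']:
    print(hit['_source']['msg'])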

Batch Load

This is an example of a batch load using the helpers.bulk() helper.

from elasticsearch import Elasticsearch
from elasticsearch import helpers
import time
es = Elasticsearch("127.0.0.1")
i = 0 
data_list=[]
for i in range(50000000):
    data_list.append({"_index":"stress","_type":"test","_source":{
            "collectTime": 1414709176,  
            "deltatime": 300,  
            "deviceId": "48572",  
            "getway": 0,  
            "ifindiscards": 0,  
            "ifindiscardspps": 0,  
             ...
             ...
             ...
            "ifinunknownprotos": 0,  
            "ifinunknownprotospps": 0
             }})
    if len(data_list) == 5000:
        helpers.bulk(es,data_list)
        data_list[:]=[]
if len(data_list) != 0:
    helpers.bulk(es,data_list) 

I read some notes along the lines of:

"Hi, I found the solution: use es.bulk(), do not use helpers.bulk()."

"es.bulk() has high efficiency."

"OK, helpers.bulk(chunk_size=5000) also has high efficiency (a little lower than es.bulk())."
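
For reference, a minimal sketch of the raw es.bulk() call the note refers to. It takes the bulk API body directly - alternating action and source lines - rather than the per-document dicts that helpers.bulk() accepts. The host, index and doc_type names are the ones used earlier in these notes; the document fields are just placeholder values:

import elasticsearch

es = elasticsearch.Elasticsearch(['photolt.home.ts:9200'])

body = []
for i in range(5000):
    # Action line: what to do and where the document goes
    body.append({'index': {'_index': 'text', '_type': 'blog', '_id': i}})
    # Source line: the document itself (placeholder values)
    body.append({'tx': i, 'rx': i * 2, 'msg': 'bulk record {}'.format(i)})

res = es.bulk(body=body)
print('errors: {}'.format(res['errors']))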