Elasticsearch monitoring and alerting based on ElastAlert

Dynamically generate ElastAlert alert rules from a given parameter (configuration) file and send email notifications to the relevant people.

The directory structure looks like this:

1├── elastalert
2├── generate_rule.py
3├── param.json
4├── rules
5└── template

You can also install the elastalert package with pip install elastalert; here I use the elastalert directory directly. The template directory stores the .tpl files. These templates are used to generate rule files into the rules directory, which ElastAlert then uses.

Configuration file param.json

  1{
  2    "dev": {
  3        "public_config": {
  4            "run_every": 5,
  5            "buffer_time": 15,
  6            "es_host": "dev.es.svc.cluster.local",
  7            "es_port": 80,
  8            "smtp_host": "email-smtp.us-west-2.amazonaws.com",
  9            "from_addr": "es-alert@local.com",
 10            "notify_email": "es-alert@local.com",
 11            "smtp_port": 25,
 12            "email_cc": [],
 13            "alert_subject": "[DEV][{1}][{0}][ELK Error Log]There are {2} error(s) in last 2 hours",
 14            "alert_subject_args": "['app_id','ecs_cluster','num_hits']"
 15        },
 16        "rules": {
 17            "frontend": {
 18                "name": "frontend",
 19                "template": "template/crontab.tpl",
 20                "index_name": "frontend-*",
 21                "event_num": 2,
 22                "f_time": 15,
 23                "cron": "0 */2 * * *",
 24                "query_string": "app_id:frontend AND message:error",
 25                "emails": [
 26                    "dev.elk.errors@local.com"
 27                ]
 28            },
 29            "nginx": {
 30                "name": "nginx",
 31                "template": "template/crontab.tpl",
 32                "index_name": "nginx-*",
 33                "event_num": 2,
 34                "f_time": 15,
 35                "cron": "0 */5 * * *",
 36                "query_string": "app_id:nginx AND status:500 AND message:error",
 37                "emails": [
 38                    "dev.elk.errors@local.com"
 39                ]
 40            },
 41            "consumer": {
 42                "name": "consumer",
 43                "template": "template/crontab.tpl",
 44                "index_name": "consumer-*",
 45                "event_num": 3,
 46                "f_time": 15,
 47                "cron": "0 */2 * * *",
 48                "query_string": "app_id:consumer AND log_level:error",
 49                "emails": [
 50                    "dev.elk.errors@local.com",
 51                    "tom@gmail.com"
 52                ]
 53            },
 54            "provider": {
 55                "name": "provider",
 56                "template": "template/crontab.tpl",
 57                "index_name": "provider-*",
 58                "event_num": 3,
 59                "f_time": 15,
 60                "cron": "0 */2 * * *",
 61                "query_string": "app_id:provider AND log_level:error",
 62                "emails": [
 63                    "dev.elk.errors@local.com"
 64                ]
 65            },
 66            "other-service": {
 67                "name": "other-service",
 68                "template": "template/crontab.tpl",
 69                "index_name": "*",
 70                "event_num": 3,
 71                "f_time": 15,
 72                "cron": "0 */2 * * *",
 73                "query_string": "app_id:example-service AND log_level:error",
 74                "emails": [
 75                    "dev.elk.errors@local.com"
 76                ]
 77            }
 78        }
 79    },
 80    "qa": {
 81        "public_config": {
 82            "run_every": 5,
 83            "buffer_time": 15,
 84            "es_host": "qa.es.svc.cluster.local",
 85            "es_port": 80,
 86            "smtp_host": "email-smtp.us-west-2.amazonaws.com",
 87            "from_addr": "es-alert@local.com",
 88            "notify_email": "es-alert@local.com",
 89            "smtp_port": 25,
 90            "email_cc": ["dev.devops@local.com"],
 91            "alert_subject": "[DEV] ServiceError: {0}",
 92            "alert_subject_args": "['app_id']"
 93        },
 94        "rules": {
 95            "account-service": {
 96                "name": "account-service",
 97                "template": "template/normal.tpl",
 98                "index_name": "account-service-*",
 99                "event_num": 2,
100                "f_time": 15,
101                "cron": "0 */2 * * *",
102                "query_string": "app_id:(account-service) AND log_level:error",
103                "emails": [
104                    "java-grp@local.com"
105                ]
106            }
107        }
108    }
109}

This parameter file contains the rule settings of the different services for each environment.

The generate_rule.py script generates the YAML rule files in the rules directory according to the definitions in the template files.

 1#!/usr/bin/env python
 2# -- coding:utf-8 --
 3
 4from jinja2 import Template
 5import json
 6import sys
 7
 8
 9class GeneratorTemp:
10    """
11    handle json template
12    """
13    __data = []
14    def __init__(self, env='dev'):
15         with open('param.json') as data_file:
16            self.__data = json.load(data_file)[env]
17
18    def __createFile(self, tpl_path='template/config.tpl', dest_path='rules/new.yaml', data=''):
19        with open(tpl_path, 'r+') as temp_file:
20            content = temp_file.read()
21        template = Template(content)
22        new_content = template.render(data)
23        with open(dest_path, "wb+") as new_file:
24            new_file.write(new_content)
25
26    def generateRuleFile(self):
27        for item in self.__data['rules']:
28            rule_param = self.__data['rules'][item]
29            self.__createFile(rule_param['template'], "rules/" + rule_param['name'] + ".yaml", rule_param)
30
31    def generateConfFile(self):
32        json = self.__data['public_config']
33        self.__createFile('template/config.tpl', './config.yaml', json)
34                
35
if __name__ == "__main__":
    # Script entry point.  The first command-line argument selects the
    # environment section of param.json to render (for example "dev" or "qa").
    if not sys.argv[1:]:
        print("argument less than one")
        sys.exit(-1)

    env = str(sys.argv[1])
    generator = GeneratorTemp(env)
    # Write the global config.yaml first, then the per-rule files.
    generator.generateConfFile()
    generator.generateRuleFile()

Two types of files are generated: the first is the task rules, and the other is the global configuration for ElastAlert.

The template directory contains several .tpl files:

  • the config.tpl

     1rules_folder: rules
     2
     3run_every:
     4  minutes: {{run_every}}
     5
     6buffer_time:
     7  minutes: {{buffer_time}}
     8
     9es_host: {{es_host}}
    10es_port: {{es_port}}
    11writeback_index: elastalert_status
    12
    13max_aggregation: 500
    14max_query_size: 1000
    15
    16alert_time_limit:
    17  days: 2
    18smtp_host: {{smtp_host}}
    19from_addr: {{from_addr}}
    20notify_email: {{notify_email}}
    21smtp_port: {{smtp_port}}
    22smtp_auth_file: email-credential.yaml
    23alert_subject: "{{alert_subject}}"
    24alert_subject_args: {{alert_subject_args}}
    25alert_text: |
    26    [Error Happened At]:   {0}.
    27
    28    [ECS Cluster]: {4}
    29
    30    [Service Name]:   {1}
    31
    32    [Number of hits]:  {2}
    33
    34    [Error Info]:   {3}    
    35
    36alert_text_args: ["@timestamp", "app_id", "num_hits", "message", "ecs_cluster"]
    37alert_text_type: alert_text_only
    38cc: 
    39{%- for email in email_cc %}
    40- "{{ email }}"
    41{%- endfor %}
    
  • the crontab.tpl rule template

     1name: "{{name}}"
     2type: frequency
     3index: "{{index_name}}"
     4num_events: {{event_num}}
     5timeframe:
     6    minutes: {{f_time}}
     7filter:
     8- query:
     9     query_string:
    10        query: "{{query_string}}"
    11alert:
    12- "email"
    13email:
    14{%- for email in emails %}
    15- "{{ email }}"
    16{%- endfor %}
    17aggregation:
    18  schedule: "{{cron}}"
    
  • the normal.tpl rule template file

     1name: "{{name}}"
     2type: frequency
     3index: "{{index_name}}"
     4num_events: {{event_num}}
     5timeframe:
     6    minutes: {{f_time}}
     7filter:
     8- query:
     9     query_string:
    10        query: "{{query_string}}"
    11alert:
    12- "email"
    13email:
    14{%- for email in emails %}
    15- "{{ email }}"
    16{%- endfor -%}
    

    Running ./generate_rule.py dev generates the rule files in the rules folder and config.yaml in the root folder.

    The final folder tree looks like this:

     1.
     2├── config.yaml
     3├── elastalert
     4├── generate_rule.py
     5├── param.json
     6├── rules
     7│   ├── consumer.yaml
     8│   ├── frontend.yaml
     9│   ├── nginx.yaml
    10│   ├── provider.yaml
    11│   └── other-service.yaml
    12└── template
    13    ├── config.tpl
    14    ├── crontab.tpl
    15    └── normal.tpl
    

    This produces the rule files required by ElastAlert.

    You can also run it in a Docker container built from a Dockerfile.

    Dockerfile

    1FROM python-2.7:latest
    2VOLUME ["/var/www"]
    3COPY . /var/www
    

    The param.json file can be fetched dynamically from an external source when the container launches.

    The container command may look like this:

    1cd /var/www
    2python generate_rule.py dev
    3python ./elastalert/create_index.py
    4python ./elastalert/elastalert.py --verbose