I have an Elasticsearch cluster with 6 data nodes and 3 master nodes.
When I execute a snapshot I receive the error "process_cluster_event_timeout_exception".
Looking at "/_cat/pending_tasks" in my cluster, there are 69 tasks with priority HIGH and source put-mapping.
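For reference, this is the call I use to inspect the queue (the ?v flag only adds the column headers, so you can see insertOrder, timeInQueue, priority and source):

GET _cat/pending_tasks?v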
My cluster is for centralized logging and has these processes writing data into it:
- Logstash - collects from Redis and writes to Elasticsearch
- apm-server
- filebeat
- metricbeat
I am already removing Beats and some applications from the apm-server.
Is it possible to change the priority of the create_snapshot task from NORMAL to HIGH or URGENT?
If that is not a solution, how do I check the correct size for my cluster?
*Normally I keep indices in my cluster for 7 days because of the backup. But because of this error, I removed the process that deletes the old data.
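For context, the cleanup I disabled was essentially a daily delete of any index older than 7 days, along the lines of the call below (the index name and date are only illustrative of our daily indices):

DELETE logstash-2020.12.01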
GET _cat/nodes?v&s=node.role:desc
ip | heap.percent | ram.percent | cpu | load_1m | load_5m | load_15m | node.role | master | name |
---|---|---|---|---|---|---|---|---|---|
10.0.2.8 | 47 | 50 | 0 | 0.00 | 0.00 | 0.00 | mi | - | prd-elasticsearch-i-020 |
10.0.0.7 | 14 | 50 | 0 | 0.00 | 0.00 | 0.00 | mi | - | prd-elasticsearch-i-0ab |
10.0.1.1 | 47 | 77 | 29 | 1.47 | 1.72 | 1.66 | mi | * | prd-elasticsearch-i-0e2 |
10.0.2.7 | 58 | 95 | 19 | 8.04 | 8.62 | 8.79 | d | - | prd-elasticsearch-i-0b4 |
10.0.2.4 | 59 | 97 | 20 | 8.22 | 8.71 | 8.76 | d | - | prd-elasticsearch-i-00d |
10.0.1.6 | 62 | 94 | 38 | 11.42 | 8.87 | 8.89 | d | - | prd-elasticsearch-i-0ff |
10.0.0.6 | 67 | 97 | 25 | 8.97 | 10.45 | 10.47 | d | - | prd-elasticsearch-i-01a |
10.0.0.9 | 57 | 98 | 32 | 11.63 | 9.64 | 9.17 | d | - | prd-elasticsearch-i-005 |
10.0.1.0 | 62 | 96 | 19 | 10.45 | 9.53 | 9.31 | d | - | prd-elasticsearch-i-088 |
My cluster stats:
{
"_nodes": {
"total": 9,
"successful": 9,
"failed": 0
},
"cluster_name": "prd-elasticsearch",
"cluster_uuid": "xxxx",
"timestamp": 1607609607018,
"status": "green",
"indices": {
"count": 895,
"shards": {
"total": 14006,
"primaries": 4700,
"replication": 1.98,
"index": {
"shards": {
"min": 2,
"max": 18,
"avg": 15.649162011173184
},
"primaries": {
"min": 1,
"max": 6,
"avg": 5.251396648044692
},
"replication": {
"min": 1,
"max": 2,
"avg": 1.9787709497206705
}
}
},
"docs": {
"count": 14896803950,
"deleted": 843126
},
"store": {
"size_in_bytes": 16778620001453
},
"fielddata": {
"memory_size_in_bytes": 4790672272,
"evictions": 0
},
"query_cache": {
"memory_size_in_bytes": 7689832903,
"total_count": 2033762560,
"hit_count": 53751516,
"miss_count": 1980011044,
"cache_size": 4087727,
"cache_count": 11319866,
"evictions": 7232139
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 155344,
"memory_in_bytes": 39094918196,
"terms_memory_in_bytes": 31533157295,
"stored_fields_memory_in_bytes": 5574613712,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 449973760,
"points_memory_in_bytes": 886771949,
"doc_values_memory_in_bytes": 650401480,
"index_writer_memory_in_bytes": 905283962,
"version_map_memory_in_bytes": 1173400,
"fixed_bit_set_memory_in_bytes": 12580800,
"max_unsafe_auto_id_timestamp": 1607606224903,
"file_sizes": {}
}
},
"nodes": {
"count": {
"total": 9,
"data": 6,
"coordinating_only": 0,
"master": 3,
"ingest": 3
},
"versions": [
"6.8.1"
],
"os": {
"available_processors": 108,
"allocated_processors": 108,
"names": [
{
"name": "Linux",
"count": 9
}
],
"pretty_names": [
{
"pretty_name": "CentOS Linux 7 (Core)",
"count": 9
}
],
"mem": {
"total_in_bytes": 821975162880,
"free_in_bytes": 50684043264,
"used_in_bytes": 771291119616,
"free_percent": 6,
"used_percent": 94
}
},
"process": {
"cpu": {
"percent": 349
},
"open_file_descriptors": {
"min": 429,
"max": 9996,
"avg": 6607
}
},
"jvm": {
"max_uptime_in_millis": 43603531934,
"versions": [
{
"version": "1.8.0_222",
"vm_name": "OpenJDK 64-Bit Server VM",
"vm_version": "25.222-b10",
"vm_vendor": "Oracle Corporation",
"count": 9
}
],
"mem": {
"heap_used_in_bytes": 137629451248,
"heap_max_in_bytes": 205373571072
},
"threads": 1941
},
"fs": {
"total_in_bytes": 45245361229824,
"free_in_bytes": 28231010959360,
"available_in_bytes": 28231011147776
},
"plugins": [
{
"name": "repository-s3",
"version": "6.8.1",
"elasticsearch_version": "6.8.1",
"java_version": "1.8",
"description": "The S3 repository plugin adds S3 repositories",
"classname": "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
"extended_plugins": [],
"has_native_controller": false
}
],
"network_types": {
"transport_types": {
"security4": 9
},
"http_types": {
"security4": 9
}
}
}
}
Data nodes: 6 x r4.4xlarge instances
Master nodes: 3 x m5.large instances
No, it is not possible to change the priority of the create_snapshot task.
Since you have 69 pending put-mapping tasks, it looks like you are doing too many dynamic mapping updates.
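If those put-mapping tasks come from dynamically mapped log fields, one way to cut them down is to declare the fields you need in an index template and restrict dynamic mapping. A minimal sketch for 6.8 - the template name, index pattern, type name and fields below are placeholders, adjust them to your own indices:

# placeholder template: pre-declares fields; with "dynamic": false, unknown fields
# are kept in _source but not indexed, so they no longer trigger put-mapping tasks
PUT _template/logs_template
{
  "index_patterns": ["logstash-*"],
  "mappings": {
    "doc": {
      "dynamic": false,
      "properties": {
        "@timestamp": { "type": "date" },
        "message": { "type": "text" }
      }
    }
  }
}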
Regarding the correct size of the cluster, I would recommend you go through the following blog posts:
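Independent of those posts, here is a rough back-of-the-envelope check using the stats you already posted, assuming the commonly cited guidelines of at most about 20 shards per GB of JVM heap and shards of roughly 20-50 GB for logging data:

14,006 shards / 6 data nodes     ≈ 2,334 shards per data node
~191 GB heap across 9 nodes      → likely around 30 GB per data node (masters on m5.large have small heaps)
30 GB heap x 20 shards per GB    ≈ 600 shards per data node as an upper bound
~16.8 TB store / 14,006 shards   ≈ 1.2 GB average shard size

So you are holding roughly four times more shards per node than the guideline suggests, and the average shard is far smaller than the usual target. Fewer primaries per daily index, one replica instead of two (your replication average is ~1.98), and/or weekly instead of daily indices would bring the shard count down and should also reduce the pressure that is timing out your snapshot and put-mapping tasks.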