Hi, I have set up Grafana Tempo with OpenTelemetry in my EKS cluster. Initially Tempo shows traces, but after about an hour no traces show up in Grafana and the query returns 404 Not Found.
Here is my Tempo configuration:
apiVersion: v1
data:
  overrides.yaml: |
    overrides:
      ingestion_rate_limit_bytes: 400000
      max_bytes_per_trace: 0
      max_search_bytes_per_trace: 100000000
  tempo.yaml: |
    auth_enabled: false
    compactor:
      compaction:
        compacted_block_retention: 24h
        compaction_window: 1h
        block_retention: 1h
    distributor:
      receivers:
        jaeger:
          protocols:
            thrift_compact:
              endpoint: 0.0.0.0:6831
        otlp:
          protocols:
            grpc:
              endpoint: 0.0.0.0:55680
    ingester:
      lifecycler:
        ring:
          replication_factor: 1
      trace_idle_period: 1s
      max_block_duration: 10m
    server:
      http_listen_port: 3200
    storage:
      trace:
        backend: local
        block:
          bloom_filter_false_positive: .05
        blocklist_poll: 30s
        local:
          path: /tmp/tempo/traces
        wal:
          path: /var/tempo/wal
        pool:
          max_workers: 1000
          queue_depth: 200000
    overrides:
      ingestion_rate_limit_bytes: 400000
      max_bytes_per_trace: 0
      max_search_bytes_per_trace: 100000000
kind: ConfigMap
metadata:
  name: tempo
  namespace: monitoring
---
apiVersion: v1
kind: Service
metadata:
  labels:
    name: tempo
  name: tempo
  namespace: monitoring
spec:
  ports:
  - name: tempo-prom-metrics
    port: 3200
    targetPort: 3200
  - name: tempo-otlp
    port: 55680
    protocol: TCP
    targetPort: 55680
  - name: http
    port: 80
    targetPort: 3200
  - name: receiver
    port: 6831
    protocol: UDP
    targetPort: 6831
  selector:
    app: tempo
    name: tempo
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: tempo
  namespace: monitoring
spec:
  minReadySeconds: 10
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: tempo
      name: tempo
  template:
    metadata:
      annotations:
        config_hash: 7f4b5fad0e6364b6a2a5ea380281cb0e
      labels:
        app: tempo
        name: tempo
    spec:
      containers:
      - args:
        - -config.file=/conf/tempo.yaml
        - -mem-ballast-size-mbs=1024
        env:
        - name: JAEGER_AGENT_PORT
          value: ""
        image: grafana/tempo:main-e6394c3
        imagePullPolicy: IfNotPresent
        name: tempo
        ports:
        - containerPort: 3200
          name: prom-metrics
        - containerPort: 55680
          name: otlp
          protocol: TCP
        volumeMounts:
        - mountPath: /conf
          name: tempo-conf
      volumes:
      - configMap:
          name: tempo
        name: tempo-conf
And here is my OTEL Collector configuration:
apiVersion: v1
kind: ConfigMap
metadata:
  name: collector-config
  namespace: prod-aro-eks-clone
  labels:
    app: opentelemetry
    component: otel-collector-conf
data:
  collector.yaml: |
    receivers:
      # Make sure to add the otlp receiver.
      # This will open up the receiver on port 4317
      otlp:
        protocols:
          grpc:
            endpoint: "0.0.0.0:5555"
          http:
      hostmetrics:
        collection_interval: 20s
        scrapers:
          cpu:
            metrics:
              system.cpu.utilization:
                enabled: true
          load:
          memory:
            metrics:
              system.memory.utilization:
                enabled: true
          disk:
          filesystem:
            metrics:
              system.filesystem.utilization:
                enabled: true
          network:
          paging:
            metrics:
              system.paging.utilization:
                enabled: true
          processes:
          process:
      k8s_cluster:
        collection_interval: 10s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, NetworkUnavailable]
        allocatable_types_to_report: [cpu, memory, storage]
      k8s_events:
        auth_type: serviceAccount
      receiver_creator:
        watch_observers: [k8s_observer]
        receivers:
          kubeletstats:
            rule: type == "k8s.node"
            config:
              collection_interval: 10s
              auth_type: serviceAccount
              endpoint: "`endpoint`:`kubelet_endpoint_port`"
              insecure_skip_verify: true
              extra_metadata_labels:
                - container.id
                - k8s.volume.type
              metric_groups:
                - node
                - pod
                - volume
                - container
      prometheus:
        config:
          scrape_configs:
            - job_name: 'kube-state-metrics'
              scrape_interval: 5s
              scrape_timeout: 1s
              static_configs:
                - targets: ['kube-prometheus-stack-kube-state-metrics.monitoring.svc.cluster.local:8080']
            - job_name: k8s
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scrape']
                  regex: "true"
                  action: keep
              metric_relabel_configs:
                - source_labels: [__name__]
                  regex: "(request_duration_seconds.*|response_duration_seconds.*)"
                  action: keep
    processors:
      memory_limiter:
        check_interval: 1s
        limit_mib: 2000
        spike_limit_mib: 500
      batch:
        timeout: 10s
        send_batch_size: 10000
      spanmetrics:
        metrics_exporter: prometheus
        latency_histogram_buckets: [100ms, 250ms, 500ms, 1s, 2s, 4s, 6s, 8s, 10s, 20s, 30s]
        dimensions:
          - name: http.method
          - name: http.status_code
          - name: db.operation
          - name: db.statement
          - name: exception.message
          - name: exception.type
          - name: messaging.message.id
          - name: messaging.message.payload_size_bytes
        dimensions_cache_size: 10000
        aggregation_temporality: "AGGREGATION_TEMPORALITY_CUMULATIVE"
      servicegraph:
        metrics_exporter: prometheus
      transform:
        metric_statements:
          - context: metric
            statements:
              - set(description, "Measures the duration of inbound HTTP requests") where name == "http.server.duration"
      cumulativetodelta:
        include:
          metrics:
            - system.network.io
            - system.disk.operations
            - system.network.dropped
            - system.network.packets
            - process.cpu.time
          match_type: strict
      resource:
        attributes:
          - key: host.id
            from_attribute: host.name
            action: upsert
      resourcedetection:
        detectors: [env, system]
      k8sattributes:
        auth_type: serviceAccount
        passthrough: false
        filter:
          node_from_env_var: K8S_NODE_NAME
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
            - k8s.pod.start_time
      metricstransform:
        transforms:
          - include: .+
            match_type: regexp
            action: update
            operations:
              - action: add_label
                new_label: kubernetes.cluster.id
                new_value: CLUSTER_ID_TO_REPLACE
              - action: add_label
                new_label: kubernetes.name
                new_value: prod-aro
    extensions:
      health_check: {}
    exporters:
      otlp:
        endpoint: "http://tempo.monitoring.svc.cluster.local:55680"
        tls:
          insecure: true
      prometheus:
        endpoint: "0.0.0.0:6666"
      logging:
        loglevel: info
      loki:
        endpoint: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
        labels:
          resource:
            container.name: "container_name"
            k8s.cluster.name: "k8s_cluster_name"
            k8s.event.reason: "k8s_event_reason"
            k8s.object.kind: "k8s_object_kind"
            k8s.object.name: "k8s_object_name"
            k8s.object.uid: "k8s_object_uid"
            k8s.object.fieldpath: "k8s_object_fieldpath"
            k8s.object.api_version: "k8s_object_api_version"
          attributes:
            k8s.event.reason: "k8s_event_reason"
            k8s.event.action: "k8s_event_action"
            k8s.event.start_time: "k8s_event_start_time"
            k8s.event.name: "k8s_event_name"
            k8s.event.uid: "k8s_event_uid"
            k8s.namespace.name: "k8s_namespace_name"
            k8s.event.count: "k8s_event_count"
          record:
            traceID: "traceid"
    service:
      extensions: [health_check]
      pipelines:
        logs:
          receivers: [k8s_events]
          processors: [memory_limiter, k8sattributes, batch]
          exporters: [loki, logging]
        traces:
          receivers: [otlp]
          processors: [spanmetrics, servicegraph, batch]
          exporters: [otlp]
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, metricstransform, k8sattributes, resourcedetection, batch]
          exporters: [logging, prometheus, otlp]
      telemetry:
        logs:
          level: info
          initial_fields:
            service: my-prom-instance
I also want to highlight one point: after a restart of the OTEL collector pod, Tempo shows traces in Grafana again. It seems the collector stops sending traces after some time, and only starts again once the OTEL service is restarted.
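One way to tell whether the collector's OTLP exporter is actually failing before a restart (rather than Tempo discarding the data) is to expose the collector's own metrics and watch its exporter counters. This is only a sketch; the 0.0.0.0:8888 address is an assumption and is not part of the config above:

service:
  telemetry:
    metrics:
      # assumed address, not in the existing config; scrape this endpoint and
      # watch otelcol_exporter_send_failed_spans and otelcol_exporter_queue_size
      # while traces stop showing up in Grafana
      address: "0.0.0.0:8888"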
Tempo should keep showing traces for a long time.
You need to update your block_retention configuration. In your tempo.yaml you currently have block_retention: 1h under compactor.compaction, which is why you aren't able to query traces older than one hour. From the Grafana Tempo documentation, block_retention is the duration to keep blocks in the backend before the compactor deletes them; the default is 336h (14 days).
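As a minimal sketch (336h simply mirrors the documented default, so pick whatever retention your local disk can hold), only block_retention needs to change in your compactor section:

compactor:
  compaction:
    compacted_block_retention: 24h   # unchanged: how long already-compacted source blocks are kept
    compaction_window: 1h
    block_retention: 336h            # was 1h; keep blocks for 14 days

Note that compacted_block_retention only governs how long source blocks stick around after they have been compacted into new ones, so it is block_retention that decides how far back you can query.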