Alerts

FilesystemAlmostFull (0 active)
# Fires once the available-space percentage of a filesystem has stayed
# below 5 for 10 minutes.
# Assumes instance:node_filesystem_avail:ratio is a 0-1 fraction produced by
# a recording rule defined elsewhere -- TODO confirm.
alert: FilesystemAlmostFull
expr: >-
  instance:node_filesystem_avail:ratio * 100 < 5
for: 10m
annotations:
  summary: >-
    The filesystem {{ $labels.device }}:{{ $labels.mountpoint }}
    is almost full
  description: >-
    The filesystem {{ $labels.device }}:{{ $labels.mountpoint }}
    on {{ $labels.instance }}
    has {{ $value | printf "%.2f" }}% space available.
FilesystemFullIn1Day (0 active)
# Extrapolates the last 6 hours of node_filesystem_avail_bytes linearly
# 24 * 3600 seconds ahead and fires when the predicted value has been
# negative for 30 minutes.
alert: FilesystemFullIn1Day
expr: >-
  predict_linear(node_filesystem_avail_bytes[6h], 24 * 3600) < 0
for: 30m
annotations:
  summary: >-
    The filesystem {{ $labels.device }}:{{ $labels.mountpoint }}
    will be full within 24 hours
  description: >-
    The filesystem {{ $labels.device }}:{{ $labels.mountpoint }}
    on {{ $labels.instance }}
    will be full in the next 24 hours.
HighUnicornUtilization (0 active)
# Fires when instance:unicorn_utilization:ratio, expressed as a percentage,
# has been above 90 for one hour.
# Assumes the recording rule yields a 0-1 fraction -- TODO confirm.
alert: HighUnicornUtilization
expr: >-
  instance:unicorn_utilization:ratio * 100 > 90
for: 1h
annotations:
  description: >-
    Unicorn instance {{ $labels.instance }} has more than 90% worker
    utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
  # Previously read "Unicorn is has high utilization".
  summary: Unicorn has high utilization
HighgRPCResourceExhaustedRate (0 active)
# Percentage of handled gRPC calls whose grpc_code is ResourceExhausted,
# aggregated across grpc_code; fires after it has exceeded 1 for one hour.
alert: HighgRPCResourceExhaustedRate
expr: >-
  sum without(grpc_code)
  (job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"})
  / sum without(grpc_code)
  (job_grpc:grpc_server_handled_total:rate5m)
  * 100 > 1
for: 1h
annotations:
  summary: High gRPC ResourceExhausted error rate
  description: >-
    gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%)
    ResourceExhausted errors over the last 60 minutes.
PostgresDatabaseDeadlockCancels (0 active)
# Fires when queries were cancelled because of deadlocks within the last
# 5 minutes.
#
# The expression previously duplicated PostgresDatabaseDeadlocks
# (pg_stat_database_deadlocks), so this rule reported detected deadlocks
# rather than cancelled queries.
# NOTE(review): pg_stat_database_conflicts_confl_deadlock is the
# postgres_exporter metric for queries cancelled due to deadlocks; confirm it
# is exported by the deployed exporter version.
alert: PostgresDatabaseDeadlockCancels
expr: >-
  increase(pg_stat_database_conflicts_confl_deadlock[5m]) > 0
annotations:
  # $value is a float64, so the Go "%d" verb renders as "%!d(float64=...)";
  # "%.0f" prints it as a rounded whole number instead.
  description: >-
    Postgres database {{ $labels.instance }} had
    {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the
    last 5 minutes.
  summary: Postgres database has queries canceled due to deadlocks
PostgresDatabaseDeadlocks (0 active)
# Fires when pg_stat_database_deadlocks increased within the last 5 minutes.
alert: PostgresDatabaseDeadlocks
expr: >-
  increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
  # $value is a float64, so the Go "%d" verb renders as "%!d(float64=...)";
  # "%.0f" prints it as a rounded whole number instead.
  description: >-
    Postgres database {{ $labels.instance }} had
    {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
  summary: Postgres database has deadlocks
PostgresDown (0 active)
# Fires when the 5-minute average of pg_up, as a percentage, is below 50.
alert: PostgresDown
expr: >-
  avg_over_time(pg_up[5m]) * 100 < 50
annotations:
  summary: >-
    The Postgres service {{ $labels.job }} is not responding
  description: >-
    The Postgres service {{ $labels.job }}
    instance {{ $labels.instance }}
    is not responding for more than 50% of the time for 5 minutes.
RedisDown (0 active)
# Fires when the 5-minute average of redis_up, as a percentage, is below 50.
alert: RedisDown
expr: >-
  avg_over_time(redis_up[5m]) * 100 < 50
annotations:
  summary: >-
    The Redis service {{ $labels.job }} is not responding
  description: >-
    The Redis service {{ $labels.job }}
    instance {{ $labels.instance }}
    is not responding for more than 50% of the time for 5 minutes.
ServiceDown (0 active)
# Fires when the 5-minute average of up, as a percentage, is below 50.
alert: ServiceDown
expr: >-
  avg_over_time(up[5m]) * 100 < 50
annotations:
  summary: >-
    The service {{ $labels.job }} is not responding
  description: >-
    The service {{ $labels.job }}
    instance {{ $labels.instance }}
    is not responding for more than 50% of the time for 5 minutes.
SidekiqJobsQueuing (0 active)
# Sums sidekiq_queue_size per queue name and fires when a queue has been
# non-empty for one hour.
alert: SidekiqJobsQueuing
expr: >-
  sum by(name) (sidekiq_queue_size) > 0
for: 1h
annotations:
  summary: Sidekiq has jobs queued
  description: >-
    Sidekiq queue {{ $labels.name }}
    has {{ $value }} jobs queued for 60 minutes.
UnicornQueueing (0 active)
# Fires when the 30-minute average of unicorn_queued_connections exceeds 1.
alert: UnicornQueueing
expr: >-
  avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
  summary: Unicorn is queueing requests
  description: >-
    Unicorn instance {{ $labels.instance }}
    is queueing requests with an average of
    {{ $value | printf "%.1f" }}
    over the last 30 minutes.
WorkhorseHighErrorRate (0 active)
# Low-traffic variant: fires when more than 50% of Workhorse requests on a
# route/method return a 5xx code while the request rate is below 10.
#
# The "< 10" filter previously wrapped the whole quotient. "/" binds tighter
# than "<", and an error ratio is at most 1, so the filter always passed and
# this rule also covered high-traffic routes. It now applies to the request
# rate (the denominator) only.
# NOTE(review): assumes the rate5m recording rule is a per-second request
# rate and that 10 is the intended traffic cut-off -- confirm.
# NOTE(review): the description says "60 minutes" but the rule has no
# "for:" clause and reads a rate5m series -- confirm the intended window.
alert: WorkhorseHighErrorRate
expr: >-
  sum without(job, code)
  (job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."})
  /
  (sum without(job, code)
  (job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m)
  < 10)
  * 100 > 50
annotations:
  description: >-
    Workhorse route {{ $labels.route }} method {{ $labels.method }} has
    more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60
    minutes.
  summary: Workhorse has high error rates
WorkhorseHighErrorRate (0 active)
# High-traffic variant: fires when more than 10% of Workhorse requests on a
# route/method return a 5xx code while the request rate is at least 10.
#
# The "> 10" filter previously wrapped the whole quotient. "/" binds tighter
# than ">", and an error ratio can never exceed 1, so the filter dropped
# every series and this alert could never fire. It now applies to the
# request rate (the denominator) only, using ">=" so a rate of exactly 10
# is covered.
# NOTE(review): assumes the rate5m recording rule is a per-second request
# rate and that 10 is the intended traffic cut-off -- confirm.
# NOTE(review): the description says "60 minutes" but the rule has no
# "for:" clause and reads a rate5m series -- confirm the intended window.
alert: WorkhorseHighErrorRate
expr: >-
  sum without(job, code)
  (job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."})
  /
  (sum without(job, code)
  (job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m)
  >= 10)
  * 100 > 10
annotations:
  description: >-
    Workhorse route {{ $labels.route }} method {{ $labels.method }} has
    more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60
    minutes.
  summary: Workhorse has high error rates