2024-06-15 18:17:08 +00:00
""" Prometheus monitoring queries. """
# pylint: disable=too-few-public-methods
2024-06-22 12:12:29 +00:00
import requests
2024-07-10 12:53:56 +00:00
2024-07-08 15:00:49 +00:00
import strawberry
2024-07-10 12:53:56 +00:00
2024-07-08 15:00:49 +00:00
from dataclasses import dataclass
2024-07-27 12:37:38 +00:00
from typing import Optional , Annotated , Union , List , Tuple
2024-06-25 17:25:31 +00:00
from datetime import datetime , timedelta
2024-06-15 18:17:08 +00:00
2024-06-17 18:56:58 +00:00
# Base URL of the Prometheus HTTP API. Endpoint paths such as
# "/api/v1/query" are appended directly, so the value must not contain
# surrounding whitespace or a trailing slash.
PROMETHEUS_URL = "http://localhost:9001"
2024-06-15 18:17:08 +00:00
2024-07-08 15:00:49 +00:00
# One sample of a Prometheus time series, exposed as a GraphQL type.
@strawberry.type
@dataclass
class MonitoringValue:
    # Moment the sample refers to.
    timestamp: datetime
    # Sample value, kept as the string it was received as.
    value: str
# A single named time series: an identifier plus its samples.
@strawberry.type
@dataclass
class MonitoringMetric:
    # Identifier of the series, taken from one of its Prometheus labels.
    id: str
    # Samples belonging to this series.
    values: List[MonitoringValue]
2024-06-15 18:17:08 +00:00
2024-07-26 11:50:12 +00:00
# Error branch of the monitoring result unions.
@strawberry.type
class MonitoringQueryError:
    # Human-readable description of why the query failed.
    error: str
2024-07-29 12:53:41 +00:00
# Success payload for queries that yield a single time series.
@strawberry.type
class MonitoringValues:
    # Samples of that one series.
    values: List[MonitoringValue]
# Success payload for queries that yield several time series.
@strawberry.type
class MonitoringMetrics:
    # One entry per returned series.
    metrics: List[MonitoringMetric]
2024-07-27 12:37:38 +00:00
# GraphQL union returned by single-series queries: either the samples or
# an error. The union name must be a valid GraphQL identifier, so it
# carries no surrounding whitespace.
MonitoringValuesResult = Annotated[
    Union[MonitoringValues, MonitoringQueryError],
    strawberry.union("MonitoringValuesResult"),
]
# GraphQL union returned by multi-series queries: either the metrics or
# an error. The union name must be a valid GraphQL identifier, so it
# carries no surrounding whitespace.
MonitoringMetricsResult = Annotated[
    Union[MonitoringMetrics, MonitoringQueryError],
    strawberry.union("MonitoringMetricsResult"),
]
2024-07-26 12:20:31 +00:00
class MonitoringQueries:
    """Static helpers that run PromQL queries against Prometheus.

    Each public method returns either a populated result object or a
    ``MonitoringQueryError``; failures while talking to Prometheus are
    reported through that error object instead of being raised.
    """

    # Upper bound, in seconds, for one HTTP call to Prometheus. Without an
    # explicit timeout ``requests`` waits indefinitely on a stalled server.
    _REQUEST_TIMEOUT_SECONDS = 30

    # Look-back window applied when the caller does not pass ``start``.
    _DEFAULT_WINDOW = timedelta(minutes=20)

    @staticmethod
    def _send_request(
        endpoint: str, params: dict, result_type: Optional[str] = None
    ) -> Union[dict, MonitoringQueryError]:
        """Send a GET request to a Prometheus API endpoint.

        Args:
            endpoint: Path appended to ``PROMETHEUS_URL``.
            params: Query-string parameters of the request.
            result_type: If given, the ``resultType`` the response must have.

        Returns:
            The ``data`` member of the JSON response, or a
            ``MonitoringQueryError`` if the request failed, returned a
            non-200 status, or produced an unexpected result type.
        """
        try:
            response = requests.get(
                f"{PROMETHEUS_URL}{endpoint}",
                params=params,
                timeout=MonitoringQueries._REQUEST_TIMEOUT_SECONDS,
            )
            if response.status_code != 200:
                return MonitoringQueryError(
                    error=f"Prometheus returned unexpected HTTP status code. Error: {response.text}"
                )
            payload = response.json()
            if result_type and payload["data"]["resultType"] != result_type:
                return MonitoringQueryError(
                    error="Unexpected resultType returned from Prometheus, request failed"
                )
            return payload["data"]
        # Deliberately broad: network errors, invalid JSON and missing keys
        # must all surface as an error object, not as an exception.
        except Exception as error:  # pylint: disable=broad-except
            return MonitoringQueryError(
                error=f"Prometheus request failed! Error: {str(error)}"
            )

    @staticmethod
    def _send_range_query(
        query: str, start: int, end: int, step: int, result_type: Optional[str] = None
    ) -> Union[dict, MonitoringQueryError]:
        """Run a range query.

        Args:
            query: PromQL expression.
            start: Start of the range as a Unix timestamp.
            end: End of the range as a Unix timestamp.
            step: Resolution step in seconds.
            result_type: If given, the ``resultType`` the response must have.

        Returns:
            The ``data`` member of the response or a ``MonitoringQueryError``.
        """
        return MonitoringQueries._send_request(
            "/api/v1/query_range",
            {
                "query": query,
                "start": start,
                "end": end,
                "step": step,
            },
            result_type,
        )

    @staticmethod
    def _send_query(
        query: str, result_type: Optional[str] = None
    ) -> Union[dict, MonitoringQueryError]:
        """Run an instant query.

        Args:
            query: PromQL expression.
            result_type: If given, the ``resultType`` the response must have.

        Returns:
            The ``data`` member of the response or a ``MonitoringQueryError``.
        """
        return MonitoringQueries._send_request(
            "/api/v1/query",
            {
                "query": query,
            },
            result_type,
        )

    @staticmethod
    def _prometheus_value_to_monitoring_value(x: Tuple[int, str]) -> MonitoringValue:
        """Convert one ``[timestamp, value]`` sample into a ``MonitoringValue``."""
        return MonitoringValue(timestamp=datetime.fromtimestamp(x[0]), value=x[1])

    @staticmethod
    def _clean_slice_id(slice_id: str, clean_id: bool) -> str:
        """Slices come in form of `/slice_name.slice`, we need to remove the `.slice` and `/` part."""
        if clean_id:
            return slice_id.split(".")[0].split("/")[1]
        return slice_id

    @staticmethod
    def _series_to_monitoring_metric(
        series: dict, id_key: str, clean_id: bool
    ) -> MonitoringMetric:
        """Convert one element of a Prometheus ``result`` list into a metric."""
        # Range queries (matrix) carry a list under "values"; instant queries
        # (vector) carry exactly one sample under "value".
        samples = series["values"] if "values" in series else [series["value"]]
        return MonitoringMetric(
            id=MonitoringQueries._clean_slice_id(
                series["metric"][id_key], clean_id=clean_id
            ),
            values=[
                MonitoringQueries._prometheus_value_to_monitoring_value(sample)
                for sample in samples
            ],
        )

    @staticmethod
    def _prometheus_response_to_monitoring_metrics(
        response: dict, id_key: str, clean_id: bool = False
    ) -> List[MonitoringMetric]:
        """Convert the ``data`` part of a Prometheus response into metrics.

        Args:
            response: ``data`` object holding a ``result`` list.
            id_key: Label whose value becomes the metric id.
            clean_id: Whether to pass the id through ``_clean_slice_id``.

        Returns:
            One ``MonitoringMetric`` per series, for both matrix and vector
            results.
        """
        return [
            MonitoringQueries._series_to_monitoring_metric(series, id_key, clean_id)
            for series in response["result"]
        ]

    @staticmethod
    def _calculate_offset_and_duration(
        start: datetime, end: datetime
    ) -> Tuple[int, int]:
        """Calculate the offset and duration for Prometheus queries.

        They must be in seconds.

        Returns:
            ``(offset, duration)``: seconds between ``end`` and now, and
            seconds between ``start`` and ``end``.
        """
        # Take "now" in the same timezone flavour as ``end`` so that a
        # timezone-aware ``end`` does not raise TypeError on subtraction;
        # for a naive ``end`` this is identical to ``datetime.now()``.
        offset = int((datetime.now(tz=end.tzinfo) - end).total_seconds())
        duration = int((end - start).total_seconds())
        return offset, duration

    @staticmethod
    def _resolve_time_range(
        start: Optional[datetime], end: Optional[datetime]
    ) -> Tuple[datetime, datetime]:
        """Fill in the default window: the last 20 minutes up to now."""
        if start is None:
            start = datetime.now() - MonitoringQueries._DEFAULT_WINDOW
        if end is None:
            end = datetime.now()
        return start, end

    @staticmethod
    def _send_matrix_query(
        query: str, start: Optional[datetime], end: Optional[datetime], step: int
    ) -> Union[dict, MonitoringQueryError]:
        """Run a range query over ``[start, end]`` and require a matrix result."""
        start, end = MonitoringQueries._resolve_time_range(start, end)
        return MonitoringQueries._send_range_query(
            query,
            int(start.timestamp()),
            int(end.timestamp()),
            step,
            result_type="matrix",
        )

    @staticmethod
    def _range_query_to_values(
        query: str, start: Optional[datetime], end: Optional[datetime], step: int
    ) -> Union[MonitoringValues, MonitoringQueryError]:
        """Run a range query and return the samples of its first series."""
        data = MonitoringQueries._send_matrix_query(query, start, end, step)
        if isinstance(data, MonitoringQueryError):
            return data
        series = data["result"]
        # An empty result means Prometheus has no samples in the window;
        # report that as "no values" rather than failing with IndexError.
        samples = series[0]["values"] if series else []
        return MonitoringValues(
            values=[
                MonitoringQueries._prometheus_value_to_monitoring_value(sample)
                for sample in samples
            ]
        )

    @staticmethod
    def _range_query_to_metrics(
        query: str,
        id_key: str,
        start: Optional[datetime],
        end: Optional[datetime],
        step: int,
    ) -> Union[MonitoringMetrics, MonitoringQueryError]:
        """Run a range query and return every series, identified by ``id_key``."""
        data = MonitoringQueries._send_matrix_query(query, start, end, step)
        if isinstance(data, MonitoringQueryError):
            return data
        return MonitoringMetrics(
            metrics=MonitoringQueries._prometheus_response_to_monitoring_metrics(
                data, id_key
            )
        )

    @staticmethod
    def _memory_by_slice(
        aggregation: str, start: Optional[datetime], end: Optional[datetime]
    ) -> Union[MonitoringMetrics, MonitoringQueryError]:
        """Aggregate ``container_memory_rss`` per slice over ``[start, end]``.

        Args:
            aggregation: PromQL ``*_over_time`` function name to apply.
            start: Start of the window, or ``None`` for 20 minutes ago.
            end: End of the window, or ``None`` for now.
        """
        start, end = MonitoringQueries._resolve_time_range(start, end)
        offset, duration = MonitoringQueries._calculate_offset_and_duration(start, end)
        # The matchers keep ids ending in "slice" and drop ids where "slice"
        # occurs twice.
        query = f'{aggregation}(container_memory_rss{{id!~".*slice.*slice", id=~".*slice"}}[{duration}s] offset {offset}s)'
        data = MonitoringQueries._send_query(query, result_type="vector")
        if isinstance(data, MonitoringQueryError):
            return data
        return MonitoringMetrics(
            metrics=MonitoringQueries._prometheus_response_to_monitoring_metrics(
                data, "id", clean_id=True
            )
        )

    @staticmethod
    def cpu_usage_overall(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringValuesResult:
        """
        Get CPU usage in percent.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying CPU usage data.
        """
        query = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
        return MonitoringQueries._range_query_to_values(query, start, end, step)

    @staticmethod
    def memory_usage_overall(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringValuesResult:
        """
        Get memory usage in percent.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying memory usage data.
        """
        query = "100 - (100 * (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))"
        return MonitoringQueries._range_query_to_values(query, start, end, step)

    @staticmethod
    def memory_usage_max_by_slice(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> MonitoringMetricsResult:
        """
        Get maximum memory usage for each service (i.e. systemd slice).

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
        """
        return MonitoringQueries._memory_by_slice("max_over_time", start, end)

    @staticmethod
    def memory_usage_average_by_slice(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> MonitoringMetricsResult:
        """
        Get average memory usage for each service (i.e. systemd slice).

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
        """
        return MonitoringQueries._memory_by_slice("avg_over_time", start, end)

    @staticmethod
    def disk_usage_overall(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringMetricsResult:
        """
        Get disk usage in percent, one series per device.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying disk usage data.
        """
        query = """100 - (100 * sum by (device) (node_filesystem_avail_bytes{fstype!="rootfs"}) / sum by (device) (node_filesystem_size_bytes{fstype!="rootfs"}))"""
        return MonitoringQueries._range_query_to_metrics(
            query, "device", start, end, step
        )

    @staticmethod
    def network_usage_overall(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringMetricsResult:
        """
        Get network usage information for both download and upload.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying network data.
        """
        # label_replace adds a "direction" label to every series so that the
        # receive and transmit rates can be told apart in one result.
        query = """
        label_replace(rate(node_network_receive_bytes_total{device!="lo"}[5m]), "direction", "receive", "device", ".*")
        or
        label_replace(rate(node_network_transmit_bytes_total{device!="lo"}[5m]), "direction", "transmit", "device", ".*")
        """
        # The id key must match the label name created by label_replace above.
        return MonitoringQueries._range_query_to_metrics(
            query, "direction", start, end, step
        )