2024-06-15 18:17:08 +00:00
""" Prometheus monitoring queries. """
# pylint: disable=too-few-public-methods
2024-06-22 12:12:29 +00:00
import requests
2024-07-10 12:53:56 +00:00
2024-07-08 15:00:49 +00:00
import strawberry
2024-07-10 12:53:56 +00:00
2024-07-08 15:00:49 +00:00
from dataclasses import dataclass
2024-07-27 12:37:38 +00:00
from typing import Optional , Annotated , Union , List , Tuple
2024-06-25 17:25:31 +00:00
from datetime import datetime , timedelta
2024-06-15 18:17:08 +00:00
2024-06-17 18:56:58 +00:00
# Base URL of the Prometheus HTTP API that the monitoring queries are sent to.
# The literal must not carry surrounding whitespace: it is interpolated directly
# into the request URL.
PROMETHEUS_URL = "http://localhost:9001"
2024-06-15 18:17:08 +00:00
2024-07-08 15:00:49 +00:00
@strawberry.type
@dataclass
class MonitoringValue:
    """A single sample of a monitored series."""

    # Moment the sample refers to.
    timestamp: datetime
    # Sample value, kept as the string it was received as.
    value: str
@strawberry.type
@dataclass
class MonitoringMetric:
    """A named series of monitoring samples."""

    # Identifier of the series (taken from a metric label by the code that
    # builds these objects).
    id: str
    # Samples belonging to this series.
    values: List[MonitoringValue]
2024-06-15 18:17:08 +00:00
2024-07-26 11:50:12 +00:00
@strawberry.type
class MonitoringQueryError:
    """Error value returned in place of data when a monitoring query fails."""

    # Human-readable description of what went wrong.
    error: str
2024-07-27 12:37:38 +00:00
# GraphQL result unions: a monitoring query yields either its data or a
# MonitoringQueryError describing the failure.
# The names given to strawberry.union() must be plain identifiers (no
# surrounding whitespace) to be usable as GraphQL type names.
# NOTE(review): GraphQL union members are normally object types; confirm that
# strawberry accepts the List[...] members used below — TODO verify.
MonitoringValuesResult = Annotated[
    Union[List[MonitoringValue], MonitoringQueryError],
    strawberry.union("MonitoringValuesResult"),
]

MonitoringMetricsResult = Annotated[
    Union[List[MonitoringMetric], MonitoringQueryError],
    strawberry.union("MonitoringMetricsResult"),
]
2024-07-26 12:20:31 +00:00
class MonitoringQueries:
    """Range queries against the Prometheus instance at PROMETHEUS_URL.

    Every public method returns either the converted data or a
    MonitoringQueryError value; request failures are reported as values
    rather than raised to the caller.
    """

    # Seconds to wait for Prometheus before giving up. Without a timeout,
    # requests.get() may block indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT_SECONDS = 5

    @staticmethod
    def _resolve_time_range(
        start: Optional[datetime], end: Optional[datetime]
    ) -> Tuple[int, int]:
        """Return (start, end) as integer Unix timestamps.

        A missing ``start`` defaults to 20 minutes ago and a missing ``end``
        defaults to the current time.
        """
        now = datetime.now()
        if start is None:
            start = now - timedelta(minutes=20)
        if end is None:
            end = now
        return int(start.timestamp()), int(end.timestamp())

    @staticmethod
    def _send_query(
        query: str, start: int, end: int, step: int, result_type: Optional[str] = None
    ) -> Union[dict, MonitoringQueryError]:
        """Run a range query and return the ``data`` part of the response.

        Args:
            query: PromQL expression to evaluate.
            start: Start of the range as a Unix timestamp.
            end: End of the range as a Unix timestamp.
            step: Query resolution step in seconds.
            result_type: If given, the ``resultType`` the response must carry.

        Returns:
            The ``data`` dictionary of the JSON response, or a
            MonitoringQueryError when the request fails, the HTTP status is
            not 200, or the result type differs from ``result_type``.
        """
        try:
            response = requests.get(
                f"{PROMETHEUS_URL}/api/v1/query_range",
                params={
                    "query": query,
                    "start": start,
                    "end": end,
                    "step": step,
                },
                timeout=MonitoringQueries._REQUEST_TIMEOUT_SECONDS,
            )
            if response.status_code != 200:
                return MonitoringQueryError(
                    error="Prometheus returned unexpected HTTP status code"
                )
            payload = response.json()
            if result_type and payload["data"]["resultType"] != result_type:
                return MonitoringQueryError(
                    error="Unexpected resultType returned from Prometheus, request failed"
                )
            return payload["data"]
        # Deliberately broad: any failure (network, timeout, malformed JSON,
        # missing keys) is converted into an error value for the caller.
        except Exception as error:  # pylint: disable=broad-except
            return MonitoringQueryError(
                error=f"Prometheus request failed! Error: {str(error)}"
            )

    @staticmethod
    def _prometheus_value_to_monitoring_value(
        x: Tuple[float, str]
    ) -> MonitoringValue:
        """Convert one ``[timestamp, value]`` sample into a MonitoringValue.

        The timestamp is interpreted with ``datetime.fromtimestamp`` and is
        therefore a naive datetime in local time.
        """
        return MonitoringValue(timestamp=datetime.fromtimestamp(x[0]), value=x[1])

    @staticmethod
    def _first_series_values(data: dict) -> List[MonitoringValue]:
        """Convert the samples of the first series in ``data["result"]``.

        An empty result list (e.g. no samples in the requested range) yields
        an empty list instead of raising IndexError.
        """
        series = data["result"]
        if not series:
            return []
        return [
            MonitoringQueries._prometheus_value_to_monitoring_value(sample)
            for sample in series[0]["values"]
        ]

    @staticmethod
    def _prometheus_response_to_monitoring_metrics(
        response: dict, id_key: str
    ) -> List[MonitoringMetric]:
        """Convert every series in ``response["result"]`` into a MonitoringMetric.

        Args:
            response: The ``data`` dictionary returned by ``_send_query``.
            id_key: Name of the metric label whose value becomes the metric id.
        """
        return [
            MonitoringMetric(
                id=series["metric"][id_key],
                values=[
                    MonitoringQueries._prometheus_value_to_monitoring_value(sample)
                    for sample in series["values"]
                ],
            )
            for series in response["result"]
        ]

    @staticmethod
    def cpu_usage(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringValuesResult:
        """
        Get CPU usage.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying CPU usage data.

        Returns:
            The samples of the first returned series (empty if there is
            none), or a MonitoringQueryError if the query failed.
        """
        start_timestamp, end_timestamp = MonitoringQueries._resolve_time_range(
            start, end
        )

        query = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'

        data = MonitoringQueries._send_query(
            query, start_timestamp, end_timestamp, step, result_type="matrix"
        )
        if isinstance(data, MonitoringQueryError):
            return data

        return MonitoringQueries._first_series_values(data)

    @staticmethod
    def memory_usage(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringValuesResult:
        """
        Get memory usage.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying memory usage data.

        Returns:
            The samples of the first returned series (empty if there is
            none), or a MonitoringQueryError if the query failed.
        """
        start_timestamp, end_timestamp = MonitoringQueries._resolve_time_range(
            start, end
        )

        query = "100 - (100 * (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))"

        data = MonitoringQueries._send_query(
            query, start_timestamp, end_timestamp, step, result_type="matrix"
        )
        if isinstance(data, MonitoringQueryError):
            return data

        return MonitoringQueries._first_series_values(data)

    @staticmethod
    def disk_usage(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringMetricsResult:
        """
        Get disk usage information.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying disk usage data.

        Returns:
            One MonitoringMetric per returned series, identified by its
            ``device`` label, or a MonitoringQueryError if the query failed.
        """
        start_timestamp, end_timestamp = MonitoringQueries._resolve_time_range(
            start, end
        )

        query = """100 - (100 * sum by (device) (node_filesystem_avail_bytes{fstype!="rootfs"}) / sum by (device) (node_filesystem_size_bytes{fstype!="rootfs"}))"""

        data = MonitoringQueries._send_query(
            query, start_timestamp, end_timestamp, step, result_type="matrix"
        )
        if isinstance(data, MonitoringQueryError):
            return data

        return MonitoringQueries._prometheus_response_to_monitoring_metrics(
            data, "device"
        )

    @staticmethod
    def network_usage(
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
        step: int = 60,  # seconds
    ) -> MonitoringMetricsResult:
        """
        Get network usage information for both download and upload.

        Args:
            start (datetime, optional): The start time.
                Defaults to 20 minutes ago if not provided.
            end (datetime, optional): The end time.
                Defaults to current time if not provided.
            step (int): Interval in seconds for querying network data.

        Returns:
            MonitoringMetric objects with ids "download" and "upload", or a
            MonitoringQueryError if the query failed.
        """
        start_timestamp, end_timestamp = MonitoringQueries._resolve_time_range(
            start, end
        )

        # PromQL has no SQL-style "AS" aliasing. Each summed rate is tagged
        # with a "direction" label via label_replace, and the two vectors are
        # merged with "or" so one range query returns both series.
        query = (
            'label_replace(sum(rate(node_network_receive_bytes_total{device!="lo"}[5m])), '
            '"direction", "download", "", "")'
            " or "
            'label_replace(sum(rate(node_network_transmit_bytes_total{device!="lo"}[5m])), '
            '"direction", "upload", "", "")'
        )

        data = MonitoringQueries._send_query(
            query, start_timestamp, end_timestamp, step, result_type="matrix"
        )
        if isinstance(data, MonitoringQueryError):
            return data

        # sum() drops the "device" label, so series are identified by the
        # "direction" label added in the query.
        return MonitoringQueries._prometheus_response_to_monitoring_metrics(
            data, "direction"
        )