diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f974de808716834067eb52249ca615229c46db1f..c24d77f9cd9a3f6e787fc614eace8a55007c9a13 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,8 +35,8 @@ build:tests: - python setup.py sdist --dist-dir=$CI_PROJECT_DIR/build artifacts: paths: - - build/clmctest-2.3.0.tar.gz - - build/clmcservice-2.3.0.tar.gz + - build/clmctest-2.3.1.tar.gz + - build/clmcservice-2.3.1.tar.gz expire_in: 1 day test:all: @@ -50,8 +50,8 @@ test:all: - echo "REPO_PASS=${REPO_PASS}" >> $CI_PROJECT_DIR/reporc - sudo scripts/test/fixture.sh create -f src/test/clmctest/rspec.json -r $CI_PROJECT_DIR -c all - sudo mkdir /var/lib/lxd/containers/test-runner/rootfs/opt/clmc/build - - sudo cp build/clmctest-2.3.0.tar.gz /var/lib/lxd/containers/test-runner/rootfs/opt/clmc/build - - sudo lxc exec test-runner -- pip3 install /opt/clmc/build/clmctest-2.3.0.tar.gz + - sudo cp build/clmctest-2.3.1.tar.gz /var/lib/lxd/containers/test-runner/rootfs/opt/clmc/build + - sudo lxc exec test-runner -- pip3 install /opt/clmc/build/clmctest-2.3.1.tar.gz - sudo lxc exec test-runner -- pytest -s --tb=short -rfp --pyargs clmctest when: on_success diff --git a/README.md b/README.md index 51441f88aed0de406beb01ea9a67378b33d83f09..0fba7df64b563baa34c38e8db3b3365caafd0274 100644 --- a/README.md +++ b/README.md @@ -122,10 +122,12 @@ pytest -s --pyargs clmctest.monitoring #### CI Testing -A lxd container is setup on givry called `flame-clmc-ci`. The container is priviledged and allows for nested containers. The container was created using the following commands +A lxd container is set up on givry called `flame-clmc-ci`. The container is privileged and allows for nested containers. The container was created using the following commands. 
+ +Note that the container only works on the `default` storage pool and not the large storage device `pool2` ``` -lxc launch ubuntu:18.04 flame-clmc-ci -c security.privileged=true -c security.nesting=true +lxc launch ubuntu:18.04 flame-clmc-ci -c security.privileged=true -c security.nesting=true ``` the container is then started and LXD initialised diff --git a/docs/AlertsSpecification.md b/docs/AlertsSpecification.md index e9c432fe89a86278b3ffc96087fb8c7a9e26c4ab..2eae1cc049929da24c7a04f76308ca6ea353a37e 100644 --- a/docs/AlertsSpecification.md +++ b/docs/AlertsSpecification.md @@ -94,7 +94,7 @@ topology_template: resource_type: flame_sfp: storage flame_sf: storage-users - location: watershed + flame_location: watershed comparison_operator: lt action: implementation: @@ -106,8 +106,7 @@ topology_template: triggers: increase_in_requests: description: | - This event triggers when the number of requests has increased relative to the number of requests received - 120 seconds ago. + This event triggers when the average number of requests has increased relative to the average received 120 seconds ago. event_type: relative metric: storage.requests condition: @@ -125,12 +124,13 @@ topology_template: - flame_sfemc decrease_in_requests: description: | - This event triggers when the number of requests has decreased relative to the number of requests received - 120 seconds ago. + This event triggers when the average number of requests has decreased relative to the average received 120 seconds ago. 
+ metadata: + percentage_evaluation: true # difference between current and past value is computed as percentage event_type: relative metric: storage.requests condition: - threshold: -100 # requests have decreased by at least 100 + threshold: -50 # requests have decreased by at least 50% granularity: 120 aggregation_method: mean resource_type: @@ -146,14 +146,13 @@ topology_template: type: eu.ict-flame.policies.Alert triggers: missing_storage_measurements: - description: This event triggers when the number of storage measurements reported falls below the threshold value. + description: This event triggers when the number of reported storage measurement points falls below the threshold value. event_type: deadman - # deadman trigger instances monitor the whole measurement (storage in this case), so simply put a star for field value - # to be compliant with the <measurement>.<field> format + # deadman triggers monitor the whole measurement (storage in this case), so simply put a star for the field value to be compliant with the <measurement>.<field> format metric: storage.* condition: threshold: 0 # if requests are less than or equal to 0 (in other words, no measurements are reported) - granularity: 60 # check for for missing data for the last 60 seconds + granularity: 60 # check for missing data for the last 60 seconds resource_type: flame_sfp: storage action: @@ -164,8 +163,7 @@ topology_template: ##### Metadata -The ***metadata*** section specifies the service function chain ID, for which this -alerts specification relates to. The format is the following: +The ***metadata*** section specifies the service function chain ID for which this alerts specification relates to. The format is the following: ```yaml metadata: @@ -174,8 +172,7 @@ metadata: ##### Policies -The ***policies*** section defines a list of policy nodes, each representing a fully qualified configuration for an -alert within CLMC. Each policy must be of type eu.ict-flame.policies.Alert. 
The format is the following: +The ***policies*** section defines a list of policy nodes, each of which includes a set of triggers representing fully qualified configurations of an alert within CLMC. Each policy must be of type eu.ict-flame.policies.Alert. The format is the following: ```yaml topology_template: @@ -189,7 +186,7 @@ topology_template: event_type: <threshold | relative | deadman> metric: <measurement>.<field> condition: - threshold: <critical value - semantics depend on the event type> + threshold: <critical value - semantic depends on the event type> granularity: <period in seconds - semantic depends on the event type> aggregation_method: <aggregation function supported by InfluxDB - e.g. 'mean'> resource_type: @@ -208,19 +205,13 @@ topology_template: ##### Definitions -* **policy_identifier** - policy label which MUST match with a StateChange policy in the TOSCA resource specification document -submitted to the FLAME Orchestrator. +* **policy_identifier** - policy label which MUST match a StateChange policy in the TOSCA resource specification document submitted to the FLAME Orchestrator. -* **event_identifier** - the name of the event that **MUST** match with the *constraint* event name referenced in the TOSCA resource -specification document submitted to the FLAME Orchestrator. +* **event_identifier** - the name of the event which MUST match with the *constraint* event name referenced in the TOSCA resource specification document submitted to the FLAME Orchestrator. -* **event_type** - the type of TICK Script template to use to create the alert - more information will be provided about -the different options here, but we assume the most common one will be **threshold**. Currently, the other supported types are -**relative** and **deadman**. +* **event_type** - the type of TICK Script template to use to create the alert - more information will be provided about the different options here, but we assume the most common one will be **threshold**. 
Currently, the other supported types are **relative** and **deadman**. These are also the main Kapacitor tasks that can be created through Chronograf. -* **metric** - the metric to query in InfluxDB, must include measurement name and field name in -format `<measurement>`.`<field>`. The only exception is when a **deadman** event type is used - then the `<field>`is not used, but -the format is still the same for consistency. Therefore, using `<measurement>.*` will be sufficient. +* **metric** - the metric to query in InfluxDB, must include measurement name and field name in format `<measurement>`.`<field>`. The only exception is when a **deadman** event type is used - then the `<field>` is not used, but the format is still the same for consistency. Therefore, using `<measurement>.*` will be sufficient. * **threshold** - * for **threshold** event type, this is the critical value the queried metric is compared to. @@ -232,30 +223,35 @@ the format is still the same for consistency. Therefore, using `<measurement>.*` * for **relative** event type, this value specifies how long back in time to compare the current metric value with * for **deadman** event type, this value specifies how long the span in time (in which the number of measurement points are checked) is -* **aggregation_method** - the function to use when querying InfluxDB, e.g. median, mean, etc. This value is only used when -the event_type is set to **threshold** or **relative**. +* **aggregation_method** - the aggregation function to use when querying InfluxDB in batch mode, e.g. median, mean, etc. This value is only used when the event_type is set to **threshold** or **relative**. + + The currently included InfluxQL functions are: + + `"count", "mean", "median", "mode", "sum", "first", "last", "max", "min", "spread", "stddev"` + +* **resource_type** - provides context for the given event - key-value pairs for the global tags of the CLMC Information Model or measurement-specific tags. 
-* **resource_type** - provides context for the given event - key-value pairs for the global tags of the CLMC Information Model. -This includes any of the following: `"flame_sfp", "flame_sf", "flame_server", "flame_location"`. -Keep in mind that **flame_sfc** and **flame_sfci** are also part of the CLMC Information Model. However, filtering on -these tags is automatically generated and added to all InfluxDB queries by using the metadata values from the -alerts specification. Therefore, including **flame_sfc** and **flame_sfci** in the **resource_type** is considered INVALID. -For more information on the global tags, please check the [documentation](monitoring.md). In addition, this could include -custom tags which are part of a specific measurement. + This includes any of the following: `"flame_sfp", "flame_sf", "flame_server", "flame_location"`. Keep in mind that **flame_sfc** and **flame_sfci** are also part of the CLMC Information Model. However, filtering on these tags is automatically generated and added to all InfluxDB queries by using the metadata values from the alerts specification. Therefore, including **flame_sfc** and **flame_sfci** in the **resource_type** is considered **INVALID**. For more information on the global tags, please check the [documentation](monitoring.md). In addition, the resource type could include custom tags which are part of a service-specific measurement. * **comparison_operator** - the logical operator to use for comparison - lt (less than), gt (greater than), lte (less than or equal to), etc. + + The comparison operator mappings are as follows: + + ``` + "lt" : "less than", + "gt" : "greater than", + "lte" : "less than or equal to", + "gte" : "greater than or equal to", + "eq" : "equal", + "neq" : "not equal" + ``` -* **implementation** - a list of the URL entries for alert handlers to which alert data is sent when the event condition is true. 
-If the alert is supposed to be sent to SFEMC, then instead of typing a URL, use **flame_sfemc** - the configurator will generate the correct -SFEMC URL. +* **implementation** - a list of the URL entries for alert handlers to which alert data is sent when the event condition is true. If the alert is supposed to be sent to SFEMC, then instead of typing a URL, use **flame_sfemc** - the configurator will generate the correct SFEMC URL. ##### Event types -* **threshold** - A threshold event type is an alert in which Kapacitor queries InfluxDB on specific metric in a given period of time -by using a query function such as *mean*, *median*, *mode*, etc. If the granularity is less than or equal to 60 seconds, then every measurement -point is monitored (improving performance), thus, ignoring the aggregation function. This value is then compared against a given threshold. If the -result of the comparison operation is true, an alert is triggered. For example: +* **threshold** - A threshold event type is an alert in which Kapacitor queries InfluxDB for a specific metric in a given period of time by using a query function such as *mean*, *median*, *mode*, etc. If the granularity is less than or equal to 60 seconds, then every measurement point is monitored (improving performance), thus, ignoring the aggregation function. This value is then compared against a given threshold. If the result of the comparison operation is true, an alert is triggered. For example: ```yaml high_latency: @@ -275,35 +271,14 @@ result of the comparison operation is true, an alert is triggered. For example: - http://companyA.alert-handler.flame.eu/high-latency ``` - This trigger specification will create an alert task in Kapacitor, which queries the **latency** field in the **network** - measurement on location **watershed** every **120** seconds and compares the mean value for the last 120 seconds with the threshold value **45**. 
- If the mean latency exceeds 45 (**gt** operator is used, which stands for **greater than**), an alert is triggered. This alert will - be sent through an HTTP POST message to the URLs listed in the **implementation** section. - - The currently included InfluxQL functions are: - - `"count", "mean", "median", "mode", "sum", "first", "last", "max", "min"` - - The comparison operator mappings are as follows: - - ``` - "lt" : "less than", - "gt" : "greater than", - "lte" : "less than or equal to", - "gte" : "greater than or equal to", - "eq" : "equal", - "neq" : "not equal" - ``` + This trigger specification will create an alert task in Kapacitor, which queries the **latency** field in the **network** measurement for location **watershed** every **120** seconds and compares the mean value for the last 120 seconds with the threshold value **45**. If the mean latency exceeds 45 (**gt** operator is used, which stands for **greater than**), an alert is triggered. This alert will be sent through an HTTP POST message to the URLs listed in the **implementation** section. -* **relative** - A relative event type is an alert in which Kapacitor computes the difference between the current aggregated value of a metric and the aggregated value -reported a given period of time ago. The difference between the current and the past value is then compared against a given -threshold. If the result of the comparison operation is true, an alert is triggered. For example: +* **relative** - A relative event type is an alert in which Kapacitor computes the difference between the current aggregated value of a metric and the aggregated value reported a given period of time ago. The difference between the current and the past value (could be raw difference, i.e. `current - past`, or percentage difference, i.e. `100 * (current - past) / past`) is then compared against a given threshold. If the result of the comparison operation is true, an alert is triggered. 
For example: ```yaml decrease_in_requests: description: | - This event triggers when the number of requests has decreased relative to the number of requests received - 120 seconds ago. + This event triggers when the number of requests has decreased relative to the number of requests received 120 seconds ago. event_type: relative metric: storage.requests condition: @@ -320,23 +295,36 @@ threshold. If the result of the comparison operation is true, an alert is trigge - flame_sfemc ``` - This trigger specification will create an alert task in Kapacitor, which compares the mean **requests** value reported in measurement **storage** - with the mean value received **120** seconds ago. If the difference between the current and the past - value is less than or equal to (comparison operator is **lte**) **-100**, an alert is triggered. Simply explained, an alert - is triggered if the **requests** current value has decreased by at least 100 relative to the value reported 120 seconds ago. - The queried value is contextualised for service function **storage-users** (using service function package **storage**) - at location **watershed**. Triggered alerts will be sent through an HTTP POST message to the URLs listed in the **implementation** section. + This trigger specification will create an alert task in Kapacitor, which compares the mean **requests** value reported in measurement **storage** with the mean value received **120** seconds ago. If the difference between the current and the past value is less than or equal to (comparison operator is **lte**) **-100**, an alert is triggered. Simply explained, an alert is triggered if the **requests** current value has decreased by at least 100 relative to the value reported 120 seconds ago. The queried value is contextualised for service function **storage-users** (using service function package **storage**) at location **watershed**. 
Triggered alerts will be sent through an HTTP POST message to the URLs listed in the **implementation** section. *Notes*: - * **aggregation_method** is not required here - the alert task compares the actual value that's being reported (stream mode) - * if **aggregation_method** is provided, it will be ignored - * if X is the current timestamp, the current aggregated value refers to the period {X - granularity; X} while the past aggregated value refers to the period {X - 2*granularity; X - granularity} + * if X is the current timestamp, the current aggregated value refers to the value for period {X - granularity; X} while the past aggregated value refers to the value for period {X - 2*granularity; X - granularity} + * this configuration uses the raw difference between the current and the past value, i.e. `current - past` + * to use percentage difference, i.e. `100 * (current - past) / past`, set the metadata *percentage_evaluation* flag to true (defaults to false), for example: + + ```yaml + percentage_decrease_in_requests: + description: This event triggers when the number of requests has decreased relative to the number of requests received 120 seconds ago. + metadata: + percentage_evaluation: true # difference between current and past value is computed as percentage + event_type: relative + metric: storage.requests + condition: + threshold: -50 # the average number of requests has decreased by at least 50% relative to the average 120 seconds ago + granularity: 120 + aggregation_method: mean + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lte + action: + implementation: + - flame_sfemc + ``` -* **deadman** - A deadman event type is an alert in which Kapacitor computes the number of reported points in a measurement -for a given period of time. This number is then compared to a given threshold value. 
If less number of points have been -reported (in comparison with the threshold value), an alert is triggered. -For example: +* **deadman** - A deadman event type is an alert in which Kapacitor computes the number of reported points in a measurement for a given period of time. This number is then compared to a given threshold value. If fewer points have been reported (in comparison with the threshold value), an alert is triggered. For example: ```yaml missing_storage_measurements: @@ -353,20 +341,33 @@ For example: - flame_sfemc ``` - This trigger specification will create an alert task in Kapacitor, which monitors the number of points reported in - measurement **storage** and having tag **sfp** set as **storage**. This value is computed every 60 seconds. - If the number of reported points is less than or equal to **0** (no points have been reported for the last 60 seconds), an alert - will be triggered. Triggered alerts will be sent through an HTTP POST message to the URLs listed in the **implementation** section. + This trigger specification will create an alert task in Kapacitor which monitors the number of points reported in measurement **storage** and having tag **flame_sfp** set as **storage**. This value is computed every 60 seconds. If the number of reported points is less than or equal to **0** (no points have been reported for the last 60 seconds), an alert will be triggered. Triggered alerts will be sent through an HTTP POST message to the URLs listed in the **implementation** section. *Notes*: - * **metric** only requires the measurement name in this event type and doesn't require a field name - * the trigger specification still needs to be consistent with the parsing rule for **metric**: `<measurement>`.`<field>` - * simply putting a `*` for field is sufficient, e.g. 
`storage.*` - * even if you put something else for field value, it will be ignored - only the **measurement** name is used - * **aggregation_method** is not required in this event type, any values provided will be ignored - * **comparison operator** is not required in this event type, any values provided will be ignored - + * **metric** only requires the measurement name in this event type and does not require a field name + * the trigger specification still needs to be consistent with the parsing rule for **metric**: `<measurement>.<field>`, therefore, simply putting a `*` for field is sufficient, e.g. `storage.*` + * not following the `<measurement>.*` format will lead to an invalid alert trigger definition + * **aggregation_method** is not required in this event type, adding it would lead to an invalid alert trigger definition + * **comparison operator** is not required in this event type, deadman alert will use `<=` by default + * the default behaviour of this event type is to check for missing measurements, however this behaviour can be changed by providing a comparison operator in the definition, for example: + + ```yaml + excessive_storage_measurements: + description: This event triggers when the number of storage measurements reported is more than the threshold value. 
+ event_type: deadman + metric: storage.* + condition: + # triggers when the number of reported storage measurement points for 60 seconds is at least 100 + threshold: 100 + granularity: 60 + resource_type: + flame_sfp: storage + comparison_operator: gte # explicitly setting the comparison operator to >= rather than the default <= + action: + implementation: + - flame_sfemc + ``` ##### Alert messages diff --git a/docs/clmc-service.md b/docs/clmc-service.md index ef4b1ee2f63c4f615df857b10e78368f257a2d5b..02f7f7f7a7afba4df29d629ca8e42820e928365c 100644 --- a/docs/clmc-service.md +++ b/docs/clmc-service.md @@ -425,7 +425,7 @@ with **/clmc-service** so that the nginx reverse proxy server (listening on port * Request Body Example: ```json { - "query_period": "30", + "query_period": 30, "results_measurement_name": "round_trip_time_measurement", "service_function_chain": "MSDemo", "service_function_chain_instance": "MSDemo_1", diff --git a/scripts/clmc-service/install-tick-stack.sh b/scripts/clmc-service/install-tick-stack.sh index a5d364e8d8ab247884fdc55b81fcdfe727eaf331..e5f740bc49190e4a8eebbf09410d2f64438bea52 100755 --- a/scripts/clmc-service/install-tick-stack.sh +++ b/scripts/clmc-service/install-tick-stack.sh @@ -91,5 +91,9 @@ for template_file in *.tick; do template_id=$(basename ${template_file} .tick) echo ${template_id} ${template_file} kapacitor define-template ${template_id} -tick ${template_file} + if [[ $? 
-ne 0 ]]; then + echo "TICKScript template ${template_file} couldn't be created" + exit 1 + fi done diff --git a/src/service/VERSION b/src/service/VERSION index 59124ba3ed3e948992d5cf837750ed347b4f46db..ae058b3516610f17300d78d120ed958d5dd1e988 100644 --- a/src/service/VERSION +++ b/src/service/VERSION @@ -1 +1 @@ -__version__ = "2.3.0" \ No newline at end of file +__version__ = "2.3.1" \ No newline at end of file diff --git a/src/service/clmcservice/__init__.py b/src/service/clmcservice/__init__.py index d9d6410d905dcabb8b7dd778eb72383408852f85..0f2b6a7d323004bdb88ddbf88f0af5785ef688a6 100644 --- a/src/service/clmcservice/__init__.py +++ b/src/service/clmcservice/__init__.py @@ -56,7 +56,7 @@ def main(global_config, **settings): settings['kapacitor_port'] = int(settings['kapacitor_port']) # the kapacitor port setting must be converted to integer instead of a string settings['clmc_service_port'] = int(settings['clmc_service_port']) # the clmc service port setting must be converted to integer instead of a string - settings["network_bandwidth"] = int(settings["network_bandwidth"]) # TODO currently assumed fixed bandwidth across all links + settings["network_bandwidth"] = int(settings["network_bandwidth"]) # this is currently not used in the graph RTT calculation model config = Configurator(settings=settings) diff --git a/src/service/clmcservice/alertsapi/alerts_specification_schema.py b/src/service/clmcservice/alertsapi/alerts_specification_schema.py index 7d15056f5e3183536a765b2c67df52ba82ac024a..7b48d43e0aabe10ab1d8b8b3dc52895221b309d9 100644 --- a/src/service/clmcservice/alertsapi/alerts_specification_schema.py +++ b/src/service/clmcservice/alertsapi/alerts_specification_schema.py @@ -64,23 +64,78 @@ URL_REGEX = re.compile( r'(?:[/?#][^\s]*)?$', # URL path or query parameters re.IGNORECASE) -# Global tags allowed to be used for filtering in the trigger condition - removed the restriction, so that all tags could be used for filtering (not just global) +# Removed the 
restriction below so that all tags could be used for filtering (not just global) +# Global tags allowed to be used for filtering in the trigger condition # CLMC_INFORMATION_MODEL_GLOBAL_TAGS = {"flame_sfp", "flame_sf", "flame_server", "flame_location"} - - # NOTICE that "flame_sfc", "flame_sfci" are not allowed, even though they are part of the CLMC Information Model # This is because those two tags are automatically added to the InfluxDB queries - the required values are retrieved from the alert spec. metadata # "flame_sfe" cannot be used as well, because the value of this tag is only known at runtime. +INVALID_TAGS = {"flame_sfc", "flame_sfci"} # these tags cannot be used as filters for an alert since filter rules for these are autogenerated when configuring the alert + +HANDLERS = { + "implementation": + [ + Or(SFEMC, And(str, lambda s: URL_REGEX.match(s) is not None)) + ] +} + +THRESHOLD_TRIGGER = { + Optional("description"): str, + "event_type": "threshold", + "metric": And(str, lambda s: len(s.split('.', 1)) == 2), + "condition": { + "threshold": Or(int, float), + "granularity": And(int, lambda p: p > 0), + Optional("aggregation_method"): And(str, lambda s: s in INFLUX_QL_FUNCTIONS), # defaults to "mean" + Optional("resource_type"): { + And(str, lambda tag: tag not in INVALID_TAGS): str + }, + "comparison_operator": And(str, lambda s: s in COMPARISON_OPERATORS) + }, + "action": HANDLERS +} + +RELATIVE_TRIGGER = { + Optional("description"): str, + Optional("metadata"): { + "percentage_evaluation": Or(True, False) # percentage_evaluation flag, only used for a relative alert + }, + "event_type": "relative", + "metric": And(str, lambda s: len(s.split('.', 1)) == 2), + "condition": { + "threshold": Or(int, float), + "granularity": And(int, lambda p: p > 0), + "aggregation_method": And(str, lambda s: s in INFLUX_QL_FUNCTIONS), + Optional("resource_type"): { + And(str, lambda tag: tag not in INVALID_TAGS): str + }, + "comparison_operator": And(str, lambda s: s in 
COMPARISON_OPERATORS) + }, + "action": HANDLERS +} + +DEADMAN_TRIGGER = { + Optional("description"): str, + "event_type": "deadman", + "metric": And(str, lambda s: len(s.split('.', 1)) == 2 and s.split('.')[1] == "*"), # format is `<measurement>.*` + "condition": { + "threshold": Or(int, float), + "granularity": And(int, lambda p: p > 0), + Optional("resource_type"): { + And(str, lambda tag: tag not in INVALID_TAGS): str + }, + Optional("comparison_operator"): And(str, lambda s: s in COMPARISON_OPERATORS) # optional comparison operator to override default deadman behaviour + }, + "action": HANDLERS +} + ALERTS_SPECIFICATION_SCHEMA = Schema({ - "tosca_definitions_version": And(str, lambda v: v == "tosca_simple_profile_for_nfv_1_0_0"), + "tosca_definitions_version": "tosca_simple_profile_for_nfv_1_0_0", Optional("description"): str, "imports": And([lambda s: s.endswith("flame_clmc_alerts_definitions.yaml")], lambda l: len(l) == 1), "metadata": { "servicefunctionchain": str - # TODO next release - uncomment - # "sfc": str, - # "sfci": str }, "topology_template": { "policies": [ @@ -88,27 +143,8 @@ ALERTS_SPECIFICATION_SCHEMA = Schema({ str: { "type": "eu.ict-flame.policies.Alert", "triggers": And({ - str: { - Optional("description"): str, - "event_type": And(str, lambda s: s in TICK_SCRIPT_TEMPLATES), - "metric": And(str, lambda s: len(s.split('.', 1)) == 2), - "condition": { - "threshold": Or(int, float), - "granularity": And(int, lambda p: p > 0), - Optional("aggregation_method"): And(str, lambda s: s in INFLUX_QL_FUNCTIONS), - Optional("resource_type"): { - str: str - }, - Optional("comparison_operator"): And(str, lambda s: s in COMPARISON_OPERATORS) - }, - "action": { - "implementation": - [ - Or(SFEMC, And(str, lambda s: URL_REGEX.match(s) is not None)) - ] - } - } - }, lambda l: len(l) > 0) + str: Or(THRESHOLD_TRIGGER, RELATIVE_TRIGGER, DEADMAN_TRIGGER) + }, lambda triggers: len(triggers) > 0) } } ] diff --git a/src/service/clmcservice/alertsapi/tests.py 
b/src/service/clmcservice/alertsapi/tests.py index ef481158a18177a10afb2aa8357ccfb5c1515542..c77b74bb0c8b66e819107cece0f86042bafa1868 100644 --- a/src/service/clmcservice/alertsapi/tests.py +++ b/src/service/clmcservice/alertsapi/tests.py @@ -605,8 +605,7 @@ def extract_alert_configuration_data(alert_spec, sfemc_fqdn, sfemc_port): yaml_alert_spec = load(alert_spec) adjust_tosca_definitions_import(yaml_alert_spec) tosca_tpl = ToscaTemplate(yaml_dict_tpl=yaml_alert_spec) - # TODO next release - uncomment - # sfc, sfc_instance = tosca_tpl.tpl["metadata"]["sfc"], tosca_tpl.tpl["metadata"]["sfci"] + sfc, sfc_instance = tosca_tpl.tpl["metadata"]["servicefunctionchain"], "{0}_1".format(tosca_tpl.tpl["metadata"]["servicefunctionchain"]) alerts = [] # saves every alert object diff --git a/src/service/clmcservice/alertsapi/utilities.py b/src/service/clmcservice/alertsapi/utilities.py index d0fb954436b8594fdc147b08a8e30a704d3bc01d..427a64c668b7f62c62e64fb7e6481554462c41f0 100644 --- a/src/service/clmcservice/alertsapi/utilities.py +++ b/src/service/clmcservice/alertsapi/utilities.py @@ -70,8 +70,7 @@ def get_resource_spec_policy_triggers(resource_spec_reference): resource_spec = load(resource_spec_reference.file) policy_trigger_ids = {} - # TODO next release - uncomment - # sfc, sfc_i = resource_spec["metadata"]["sfc"], resource_spec["metadata"]["sfci"] + sfc = resource_spec["metadata"]["servicefunctionchain"] sfc_i = "{0}_1".format(sfc) @@ -145,7 +144,9 @@ class TICKScriptTemplateFiller: @staticmethod def fill_template_vars(template_type, **kwargs): """ - A utility function acting as an entry poiny to the fill_<template_type>_template_vars() functions defined below. + A utility function acting as an entry point to the _fill_<template_type>_template_vars() functions defined below. Each of the functions defined below + have the arguments it requires set as None and also additional **kwargs entry. 
This allows us to pass all arguments to the appropriate function and let + it pick up those it needs. :param template_type: the template type - e.g. :param kwargs: keyword arguments to forward to the actual function that will be used @@ -178,7 +179,9 @@ class TICKScriptTemplateFiller: :return: a dictionary object ready to be posted to kapacitor to create a "threshold" task from template. """ - comparison_lambda = '"real_value" {0} {1}'.format(comparison_operator, critical_value) # build up lambda string, e.g. "real_value" >= 10 + aggregation_field_name = "real_value" + + comparison_lambda = '"{0}" {1} {2}'.format(aggregation_field_name, comparison_operator, critical_value) # build up lambda string, e.g. "real_value" >= 10 template_vars = { "sfc": { @@ -209,6 +212,10 @@ class TICKScriptTemplateFiller: "type": "string", "value": influx_function }, + "aggregatedFieldName": { + "type": "string", + "value": aggregation_field_name + }, "comparisonLambda": { "type": "lambda", "value": comparison_lambda @@ -303,7 +310,7 @@ class TICKScriptTemplateFiller: @staticmethod def _fill_relative_template_vars(sfc=None, sfci=None, policy=None, db=None, measurement=None, field=None, influx_function=None, critical_value=None, comparison_operator=None, - alert_period=None, topic_id=None, event_id=None, where_clause=None, **kwargs): + alert_period=None, topic_id=None, event_id=None, where_clause=None, metadata=None, **kwargs): """ Creates a dictionary object ready to be posted to kapacitor to create a "relative" task from template. @@ -317,11 +324,28 @@ class TICKScriptTemplateFiller: :param topic_id: topic identifier :param event_id: event identifier :param where_clause: (OPTIONAL) argument for filtering the influx query by tag values + :param metadata: (OPTIONAL) argument to request a percentage evaluation of the difference between the current and the past value :return: a dictionary object ready to be posted to kapacitor to create a "relative" task from template. 
""" - comparison_lambda = '"diff" {0} {1}'.format(comparison_operator, critical_value) + current_prefix = "current" + + past_prefix = "past" + + aggregation_field_name = "value" + + difference_field_name = "diff" + + # check if the percentage_evaluation flag was set (defaults to False, hence not set) + if metadata.get("percentage_evaluation", False): + # if percentage evaluation is requested, use a different evaluation lambda + evaluation_lambda = '100.0 * (float("{1}.{0}") - float("{2}.{0}")) / float("{2}.{0}")'.format(aggregation_field_name, current_prefix, past_prefix) + else: + # otherwise, use the simple diff evaluation lambda + evaluation_lambda = 'float("{1}.{0}") - float("{2}.{0}")'.format(aggregation_field_name, current_prefix, past_prefix) + + comparison_lambda = '"{0}" {1} {2}'.format(difference_field_name, comparison_operator, critical_value) template_vars = { "sfc": { @@ -352,6 +376,26 @@ class TICKScriptTemplateFiller: "type": "string", "value": influx_function }, + "aggregatedFieldName": { + "type": "string", + "value": aggregation_field_name + }, + "differenceFieldName": { + "type": "string", + "value": difference_field_name + }, + "currentPrefix": { + "type": "string", + "value": current_prefix + }, + "pastPrefix": { + "type": "string", + "value": past_prefix + }, + "evaluationLambda": { + "type": "lambda", + "value": evaluation_lambda + }, "comparisonLambda": { "type": "lambda", "value": comparison_lambda @@ -379,7 +423,8 @@ class TICKScriptTemplateFiller: return template_vars @staticmethod - def _fill_deadman_template_vars(sfc=None, sfci=None, policy=None, db=None, measurement=None, critical_value=None, alert_period=None, topic_id=None, event_id=None, where_clause=None, **kwargs): + def _fill_deadman_template_vars(sfc=None, sfci=None, policy=None, db=None, measurement=None, critical_value=None, comparison_operator=None, + alert_period=None, topic_id=None, event_id=None, where_clause=None, **kwargs): """ Creates a dictionary object ready to be 
posted to kapacitor to create a "deadman" task from template. @@ -394,6 +439,13 @@ class TICKScriptTemplateFiller: :return: a dictionary object ready to be posted to kapacitor to create a "deadman" task from template. """ + # default comparison operator is less than or equal to (default behavior of a deadman alert is for alerting on missing measurements) + if comparison_operator is None: + comparison_operator = "<=" + + # emitted is a field that comes from a built-in kapacitor function, hence it cannot be parameterised + comparison_lambda = '"emitted" {0} {1}'.format(comparison_operator, critical_value) + template_vars = { "sfc": { "type": "string", @@ -415,6 +467,10 @@ class TICKScriptTemplateFiller: "type": "string", "value": measurement }, + "comparisonLambda": { + "type": "lambda", + "value": comparison_lambda + }, "alertPeriod": { "type": "duration", "value": alert_period diff --git a/src/service/clmcservice/alertsapi/views.py b/src/service/clmcservice/alertsapi/views.py index f2bb1e609e44d4094d96d558f6c0a8a3c6a05b19..4ddbc58ce1f4b10a2858a90145188960f4221853 100644 --- a/src/service/clmcservice/alertsapi/views.py +++ b/src/service/clmcservice/alertsapi/views.py @@ -48,7 +48,7 @@ class AlertsConfigurationAPI(object): A class-based view for configuring alerts within CLMC. 
""" - STREAM_PERIOD_LIMIT = 60 # if alert period is <= 60 seconds, then a stream template is used, otherwise use batch + STREAM_PERIOD_LIMIT = 60 # if alert period is less than 60 seconds, then a stream template is used, otherwise use batch DUAL_VERSION_TEMPLATES = {"threshold"} # this set defines all template types that are written in two versions (stream and batch) @@ -335,7 +335,9 @@ class AlertsConfigurationAPI(object): kapacitor_api_topic_handlers_url = "http://{0}:{1}{2}/{3}/handlers".format(kapacitor_host, kapacitor_port, self.KAPACITOR_TOPIC_API_PREFIX, topic_id) http_response = get(kapacitor_api_topic_handlers_url) if http_response.status_code != 200: - continue # if the topic doesn't exist continue with the other triggers + log.error("Unexpected status code returned when fetching the list of handlers, url - {0}, status - {1}, response - {2}".format(kapacitor_api_topic_handlers_url, + http_response.status_code, http_response.text)) + continue # if for some reason a status different than 200 is returned, skip this topic (we do not expect this to happen) # delete alert handlers http_handlers = http_response.json()['handlers'] @@ -501,6 +503,10 @@ class AlertsConfigurationAPI(object): policy_id = policy.name resource_spec_trigger_id = resource_spec_policy_triggers.get("{0}\n{1}".format(policy_id, event_id)) + # get metadata defined for the trigger, defaults to empty key-value mapping + # the metadata is only useful for the relative alert template - can be used to set the diff evaluation to percentage difference + trigger_metadata = trigger.trigger_tpl.get("metadata", {}) + event_type = trigger.trigger_tpl["event_type"] template_id = "{0}-template".format(event_type) measurement, field = trigger.trigger_tpl["metric"].split(".") @@ -523,7 +529,7 @@ class AlertsConfigurationAPI(object): where_filters_list = map(lambda tag_name: '"{0}"=\'{1}\''.format(tag_name, tags[tag_name]), tags) where_clause = " AND ".join(where_filters_list) - comparison_operator = 
COMPARISON_OPERATORS[condition.get("comparison_operator", "gte")] # if not specified, use "gte" (>=) + comparison_operator = COMPARISON_OPERATORS.get(condition.get("comparison_operator")) # if not specified, the comparison operator will be set to None # generate topic and alert identifiers topic_id = "{0}\n{1}\n{2}\n{3}".format(sfc, sfc_instance, policy_id, event_id) # scoped per service function chain instance (no two sfc instances report to the same topic) @@ -543,7 +549,7 @@ class AlertsConfigurationAPI(object): # all extracted properties from the trigger are passed, the TICKScriptTemplateFiller entry point then forwards those to the appropriate function for template filling template_vars = TICKScriptTemplateFiller.fill_template_vars(event_type, sfc=sfc, sfci=sfc_instance, policy=policy_id, db=db, measurement=measurement, field=field, influx_function=influx_function, critical_value=critical_value, comparison_operator=comparison_operator, - alert_period=alert_period, topic_id=topic_id, event_id=event_id, where_clause=where_clause) + alert_period=alert_period, topic_id=topic_id, event_id=event_id, where_clause=where_clause, metadata=trigger_metadata) # create and activate alert task through the kapacitor HTTP API kapacitor_http_request_body = { diff --git a/src/service/requirements.txt b/src/service/requirements.txt index 9cfd29044267440e933889e83ee07608de46ad51..9ad70ffe18fb8716354429dda872224fb9b34540 100644 --- a/src/service/requirements.txt +++ b/src/service/requirements.txt @@ -1,65 +1,72 @@ +MarkupSafe==1.1.1 PasteDeploy==2.0.1 -schema==0.6.8 -psycopg2==2.7.5 -psutil==5.6.1 +PrettyTable==0.7.2 +PyYAML==3.13 +Pygments==2.3.1 +SQLAlchemy==1.2.12 +WebOb==1.8.5 +asn1crypto==0.24.0 +atomicwrites==1.3.0 +attrs==19.1.0 +certifi==2019.3.9 +chardet==3.0.4 +click==7.0 colorama==0.4.1 +filelock==3.0.10 +hupper==1.6.1 +idna==2.8 +ipaddress==1.0.22 +more-itertools==7.0.0 +neobolt==1.7.12 pbr==5.2.0 pluggy==0.11.0 -prettytable==0.7.2 -neobolt==1.7.12 
-more-itertools==7.0.0 +psutil==5.6.1 +psycopg2==2.7.5 py==1.8.0 -certifi==2019.3.9 -attrs==19.1.0 -WebOb==1.8.5 -atomicwrites==1.3.0 -waitress==1.1.0 -Click==7.0 pycparser==2.19 -PyYAML==3.13 -asn1crypto==0.24.0 -SQLAlchemy==1.2.12 -urllib3==1.24.3 -ipaddress==1.0.22 -six==1.12.0 -wcwidth==0.1.7 +pyparsing==2.4.0 pyperclip==1.7.0 -hupper==1.6.1 -chardet==3.0.4 -toml==0.10.0 pytz==2019.1 -filelock==3.0.10 -MarkupSafe==1.1.1 -translationstring==1.3 -Pygments==2.3.1 -idna==2.8 -pyparsing==2.4.0 repoze.lru==0.7 +schema==0.6.8 +setuptools==41.0.1 +six==1.12.0 +toml==0.10.0 +translationstring==1.3 +urllib3==1.24.3 venusian==1.2.0 -zope.deprecation==4.4.0 -stevedore==1.30.1 +waitress==1.1.0 +wcwidth==0.1.7 +wheel==0.33.4 + +Babel==2.6.0 Mako==1.0.9 cffi==1.12.3 -zope.interface==4.6.0 -neotime==1.7.4 -Babel==2.6.0 -python-dateutil==2.8.0 -plaster==1.0 cmd2==0.9.12 -requests==2.21.0 +neotime==1.7.4 pathlib2==2.3.3 +plaster==1.0 prompt-toolkit==2.0.9 -pytest==3.8.1 -cryptography==2.6.1 -transaction==2.4.0 +python-dateutil==2.8.0 +requests==2.21.0 +stevedore==1.30.1 +zope.deprecation==4.4.0 +zope.interface==4.6.0 + cliff==2.14.1 -plaster-pastedeploy==0.6 +cryptography==2.6.1 influxdb==5.2.0 +plaster-pastedeploy==0.6 py2neo==4.2.0 -zope.sqlalchemy==1.0 +pytest==3.8.1 +transaction==2.4.0 + pyOpenSSL==19.0.0 -tosca-parser==1.1.0 pyramid==1.9.2 -pyramid-mako==1.0.2 +tosca-parser==1.1.0 +zope.sqlalchemy==1.0 + pyramid-exclog==1.0 +pyramid-mako==1.0.2 + pyramid-debugtoolbar==4.5 \ No newline at end of file diff --git a/src/service/resources/TICKscript/deadman-template.tick b/src/service/resources/TICKscript/deadman-template.tick index ae6d6859ebf144fc88ea970cbd3d1be9f885db45..21a0e89c3fe8f150bc3be1e3da4d7951901549d8 100644 --- a/src/service/resources/TICKscript/deadman-template.tick +++ b/src/service/resources/TICKscript/deadman-template.tick @@ -12,11 +12,13 @@ var measurement string var whereClause = lambda: TRUE // default value is a function which returns TRUE, hence no 
filtering of the query result -var messageValue = 'TRUE' // default value is TRUE, as this is what SFEMC expects as a notification for an event rule +var messageValue = 'TRUE' // default value is TRUE, this field could be used to convey further information for the alert + +var comparisonLambda lambda // comparison function e.g. "emitted" <= 40.0 var alertPeriod duration -var throughputThreshold float // alerts will trigger if data points reported durign the alert period fall bellow this value +var throughputThreshold float // alerts will trigger if data points reported during the alert period fall below this value var topicID string @@ -29,9 +31,15 @@ stream .retentionPolicy(rp) .measurement(measurement) .where(whereClause) - | deadman(throughputThreshold, alertPeriod) + | stats(alertPeriod) + .align() + | derivative('emitted') + .unit(alertPeriod) + .nonNegative() + | alert() .id(eventID) .details('db=' + db + ',sfc=' + sfc + ',sfci=' + sfci + ',policy=' + policy) + .crit(comparisonLambda) .message(messageValue) .topic(topicID) .noRecoveries() diff --git a/src/service/resources/TICKscript/relative-template.tick b/src/service/resources/TICKscript/relative-template.tick index 1b27df7935ba4beef35190389e3d7fcb6d521c55..2accc00eba4d82cdf76fdd264a468cb5e32ff1f9 100644 --- a/src/service/resources/TICKscript/relative-template.tick +++ b/src/service/resources/TICKscript/relative-template.tick @@ -14,9 +14,15 @@ var field string var influxFunction string +var aggregatedFieldName string + +var differenceFieldName string + var whereClause = 'TRUE' // default value is TRUE, hence no filtering of the query result -var messageValue = 'TRUE' // default value is TRUE, as this is what SFEMC expects as a notification for an event rule +var messageValue = 'TRUE' // default value is TRUE, this field could be used to convey further information for the alert + +var evaluationLambda lambda // evaluation function to get the difference between current and past value, e.g. 
float("current.value") - float("past.value") var comparisonLambda lambda // comparison function e.g. "diff" > 40 @@ -26,15 +32,19 @@ var topicID string var eventID string // topicID is based on the event ID, but represents a hash value +// these two variables can be modified when creating an alert based on this template, default values are given so that Kapacitor successfully validates the TICK script +var pastPrefix = 'past' +var currentPrefix = 'current' + var current = batch - |query('SELECT ' + influxFunction + '(' + field + ') AS value FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) + |query('SELECT ' + influxFunction + '(' + field + ') AS ' + aggregatedFieldName + ' FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) .period(alertPeriod) .every(alertPeriod) .align() var past = batch - |query('SELECT ' + influxFunction + '(' + field + ') AS value FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) + |query('SELECT ' + influxFunction + '(' + field + ') AS ' + aggregatedFieldName + ' FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) .period(alertPeriod) .every(alertPeriod) .offset(alertPeriod) @@ -43,9 +53,9 @@ var past = batch past | join(current) - .as('past', 'current') - | eval(lambda: float("current.value" - "past.value")) - .as('diff') + .as(pastPrefix, currentPrefix) + | eval(evaluationLambda) + .as(differenceFieldName) | alert() .id(eventID) .details('db=' + db + ',sfc=' + sfc + ',sfci=' + sfci + ',policy=' + policy) diff --git a/src/service/resources/TICKscript/threshold-batch-template.tick b/src/service/resources/TICKscript/threshold-batch-template.tick index ed7b60357b82e2ae5064e894dfcac1b55eb8dc3d..b001cf082e91f503481311b3bad6bdf053333255 100644 --- a/src/service/resources/TICKscript/threshold-batch-template.tick +++ b/src/service/resources/TICKscript/threshold-batch-template.tick @@ -14,9 +14,11 @@ var field string var influxFunction string +var 
aggregatedFieldName string + var whereClause = 'TRUE' // default value is TRUE, hence no filtering of the query result -var messageValue = 'TRUE' // default value is TRUE, as this is what SFEMC expects as a notification for an event rule +var messageValue = 'TRUE' // default value is TRUE, this field could be used to convey further information for the alert var comparisonLambda lambda // comparison function e.g. "real_value" > 40 @@ -27,7 +29,7 @@ var topicID string var eventID string // topicID is based on the event ID, but represents a hash value batch - |query('SELECT ' + influxFunction + '(' + field + ') AS real_value FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) + |query('SELECT ' + influxFunction + '(' + field + ') AS ' + aggregatedFieldName + ' FROM "' + db + '"."' + rp + '"."' + measurement + '" WHERE ' + whereClause) .period(alertPeriod) .every(alertPeriod) |alert() diff --git a/src/service/resources/TICKscript/threshold-stream-template.tick b/src/service/resources/TICKscript/threshold-stream-template.tick index 68b50f353cd69fce20db3cf6ed7e7b04712bacc1..0e8619b8073de33c4cdc6ac12230b0923df4b071 100644 --- a/src/service/resources/TICKscript/threshold-stream-template.tick +++ b/src/service/resources/TICKscript/threshold-stream-template.tick @@ -12,7 +12,7 @@ var measurement string var whereClause = lambda: TRUE // default value is a function which returns TRUE, hence no filtering of the query result -var messageValue = 'TRUE' // default value is TRUE, as this is what SFEMC expects as a notification for an event rule +var messageValue = 'TRUE' // default value is TRUE, this field could be used to convey further information for the alert var comparisonLambda lambda // comparison function e.g. 
"real_value" > 40 diff --git a/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-13.yaml b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-13.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb3ff637b7288346b27ab0f1af749d959e0057cf --- /dev/null +++ b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-13.yaml @@ -0,0 +1,74 @@ +tosca_definitions_version: tosca_simple_profile_for_nfv_1_0_0 + +description: TOSCA Alerts Configuration document + +imports: +- flame_clmc_alerts_definitions.yaml + +metadata: + servicefunctionchain: companyA-VR + +topology_template: + + policies: + - high_latency_policy: + type: eu.ict-flame.policies.Alert + triggers: + high_latency: + description: This event triggers when the mean network latency in a given location exceeds a given threshold (in ms). + metadata: # empty metadata is not allowed + event_type: threshold + metric: network.latency + condition: + threshold: 45 + granularity: 120 + aggregation_method: mean + resource_type: + flame_location: watershed + comparison_operator: gt + action: + implementation: + - flame_sfemc + - http://companyA.alert-handler.flame.eu/high-latency + - requests_diff_policy: + type: eu.ict-flame.policies.Alert + triggers: + decrease_in_requests: + description: | + This event triggers when the number of requests has decreased relative to the number of requests received + 120 seconds ago. + event_type: relative + metric: storage.requests + condition: + threshold: -100 + granularity: 120 + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lte + action: + implementation: + - http://sfemc.flame.eu/notify + - low_requests_policy: + type: eu.ict-flame.policies.Alert + triggers: + low_requests: + description: | + This event triggers when the last reported number of requests for a given service function + falls behind a given threshold. 
+ event_type: threshold + metric: storage.requests + condition: + threshold: 5 + granularity: 60 + aggregation_method: last + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lt + action: + implementation: + - http://sfemc.flame.eu/notify + - http://companyA.alert-handler.flame.eu/low-requests diff --git a/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-14.yaml b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..313203b31eb07090047af168f4688127982ac518 --- /dev/null +++ b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-14.yaml @@ -0,0 +1,75 @@ +tosca_definitions_version: tosca_simple_profile_for_nfv_1_0_0 + +description: TOSCA Alerts Configuration document + +imports: +- flame_clmc_alerts_definitions.yaml + +metadata: + servicefunctionchain: companyA-VR + +topology_template: + + policies: + - high_latency_policy: + type: eu.ict-flame.policies.Alert + triggers: + high_latency: + description: This event triggers when the mean network latency in a given location exceeds a given threshold (in ms). + event_type: threshold + metric: network.latency + condition: + threshold: 45 + granularity: 120 + aggregation_method: mean + resource_type: + flame_location: watershed + comparison_operator: gt + action: + implementation: + - flame_sfemc + - http://companyA.alert-handler.flame.eu/high-latency + - requests_diff_policy: + type: eu.ict-flame.policies.Alert + triggers: + decrease_in_requests: + description: | + This event triggers when the number of requests has decreased relative to the number of requests received + 120 seconds ago. 
+ metadata: + percentage_comparison: true # correct flag is percentage_evaluation + event_type: relative + metric: storage.requests + condition: + threshold: 5 + granularity: 120 + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lte + action: + implementation: + - http://sfemc.flame.eu/notify + - low_requests_policy: + type: eu.ict-flame.policies.Alert + triggers: + low_requests: + description: | + This event triggers when the last reported number of requests for a given service function + falls behind a given threshold. + event_type: threshold + metric: storage.requests + condition: + threshold: 5 + granularity: 60 + aggregation_method: last + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lt + action: + implementation: + - http://sfemc.flame.eu/notify + - http://companyA.alert-handler.flame.eu/low-requests diff --git a/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-15.yaml b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-15.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c09719754cfb952f8dcb5125796d9ae564588bfd --- /dev/null +++ b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-15.yaml @@ -0,0 +1,75 @@ +tosca_definitions_version: tosca_simple_profile_for_nfv_1_0_0 + +description: TOSCA Alerts Configuration document + +imports: +- flame_clmc_alerts_definitions.yaml + +metadata: + servicefunctionchain: companyA-VR + +topology_template: + + policies: + - high_latency_policy: + type: eu.ict-flame.policies.Alert + triggers: + high_latency: + description: This event triggers when the mean network latency in a given location exceeds a given threshold (in ms). 
+ event_type: threshold + metric: network.latency + condition: + threshold: 45 + granularity: 120 + aggregation_method: mean + resource_type: + flame_location: watershed + comparison_operator: gt + action: + implementation: + - flame_sfemc + - http://companyA.alert-handler.flame.eu/high-latency + - requests_diff_policy: + type: eu.ict-flame.policies.Alert + triggers: + decrease_in_requests: + description: | + This event triggers when the number of requests has decreased relative to the number of requests received + 120 seconds ago. + metadata: + percentage_evaluation: percentage # only True or False values allowed for this flag + event_type: relative + metric: storage.requests + condition: + threshold: 5 + granularity: 120 + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lte + action: + implementation: + - http://sfemc.flame.eu/notify + - low_requests_policy: + type: eu.ict-flame.policies.Alert + triggers: + low_requests: + description: | + This event triggers when the last reported number of requests for a given service function + falls behind a given threshold. 
+ event_type: threshold + metric: storage.requests + condition: + threshold: 5 + granularity: 60 + aggregation_method: last + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + comparison_operator: lt + action: + implementation: + - http://sfemc.flame.eu/notify + - http://companyA.alert-handler.flame.eu/low-requests diff --git a/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-16.yaml b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-16.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3026f86f11e0565ad99ca0bfd0ec84891256cba --- /dev/null +++ b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-16.yaml @@ -0,0 +1,36 @@ +tosca_definitions_version: tosca_simple_profile_for_nfv_1_0_0 + +description: TOSCA Alerts Configuration document + +imports: +- flame_clmc_alerts_definitions.yaml + +metadata: + servicefunctionchain: companyA-VR + +topology_template: + + policies: + - requests_diff_policy: + type: eu.ict-flame.policies.Alert + triggers: + decrease_in_requests: + description: | + This event triggers when the number of requests has decreased relative to the number of requests received + 120 seconds ago. 
+ metadata: + percentage_evaluation: False + event_type: relative + metric: storage.requests + condition: + threshold: 5 + granularity: 120 + resource_type: + flame_sfp: storage + flame_sf: storage-users + flame_location: watershed + flame_sfc: storage-sfc # flame_sfc tag is not allowed (autogenerated by CLMC) + comparison_operator: lte + action: + implementation: + - http://sfemc.flame.eu/notify diff --git a/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-17.yaml b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-17.yaml new file mode 100644 index 0000000000000000000000000000000000000000..966a2f8fc35bb170871e4ba4e747b4e570607a5a --- /dev/null +++ b/src/service/resources/tosca/test-data/clmc-validator/invalid/alerts_test_config-17.yaml @@ -0,0 +1,32 @@ +tosca_definitions_version: tosca_simple_profile_for_nfv_1_0_0 + +description: TOSCA Alerts Configuration document + +imports: +- flame_clmc_alerts_definitions.yaml + +metadata: + servicefunctionchain: companyA-VR + +topology_template: + + policies: + - high_latency_policy: + type: eu.ict-flame.policies.Alert + triggers: + high_latency: + description: This event triggers when the mean network latency in a given location exceeds a given threshold (in ms). 
+ event_type: threshold + metric: network.latency + condition: + threshold: 45 + granularity: 120 + aggregation_method: mean + resource_type: + flame_location: watershed + flame_sfci: storage-sfci # flame_sfci tag is not allowed (autogenerated by CLMC) + comparison_operator: gt + action: + implementation: + - flame_sfemc + - http://companyA.alert-handler.flame.eu/high-latency diff --git a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-1.yaml b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-1.yaml index 6db6518e46b29b9f53d1f09f8f6ce5a43669d9be..750a0740178cc4f8f1d69f3efdd817f419466f26 100644 --- a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-1.yaml +++ b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-1.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: companyA-VR -# sfci: companyA-VR-premium topology_template: @@ -16,8 +15,10 @@ topology_template: type: eu.ict-flame.policies.Alert triggers: high_latency: - description: This event triggers when the mean network latency in a given location exceeds a given threshold (in ms). - event_type: threshold + description: This event triggers when the mean network latency in a given location has increased by a given threshold (in ms). 
+ metadata: + percentage_evaluation: False # explicitly setting the percentage flag to false is also allowed + event_type: relative metric: network.latency condition: threshold: 45 diff --git a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-2.yaml b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-2.yaml index ee3de7a27e647c208dab43f3dff920ba418eae20..47007c7b1ae8d801c4da524b5b7a8892c26f0659 100644 --- a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-2.yaml +++ b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-2.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: companyA-VR -# sfci: companyA-VR-premium topology_template: @@ -35,11 +34,14 @@ topology_template: description: | This event triggers when the number of requests has decreased relative to the number of requests received 120 seconds ago. + metadata: + percentage_evaluation: false event_type: relative metric: storage.requests condition: threshold: -100 # requests have decreased by at least 100 granularity: 120 + aggregation_method: mean resource_type: status_code: '200' flame_sfp: storage diff --git a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-3.yaml b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-3.yaml index b2bbbc28b588516bee49c3485f607a0e8e479382..27f3d72fc487f220e72c7d750d54bb148455a92e 100644 --- a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-3.yaml +++ b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-3.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: companyA-VR -# sfci: companyA-VR-premium topology_template: @@ -25,7 +24,7 @@ topology_template: # aggregation_method is optional, default value is "mean" resource_type: flame_location: watershed - # comparison operator is optional, default value is >= or "gte" + comparison_operator: 
neq action: implementation: - flame_sfemc diff --git a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-4.yaml b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-4.yaml index b4f8e4a371f42ad628d626ed3f532bb9e7930852..d5ae1608c954a4bf293d89114e774eec239cea66 100644 --- a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-4.yaml +++ b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-4.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: companyA-VR -# sfci: companyA-VR-premium topology_template: @@ -23,9 +22,7 @@ topology_template: threshold: 45 granularity: 45 aggregation_method: median - resource_type: - flame_location: watershed - flame_server: watershed + # resource type missing - optional, so it is valid comparison_operator: neq action: implementation: @@ -60,9 +57,7 @@ topology_template: condition: threshold: 0 # if requests are less than or equal to 0 (in other words, no measurements are reported) granularity: 60 # check for for missing data for the last 60 seconds - resource_type: - flame_sfp: storage - comparison_operator: gte # although events of type deadman do not use a comparison operator, the validator will not complain if one is given, it will simply ignore it + # resource type missing - optional, so it is valid action: implementation: - flame_sfemc diff --git a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-5.yaml b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-5.yaml index d88fb000f852eacbf27f1bdb195c80d635edce0e..4aa5f5140141914dfd881f78d01db2707c6d9353 100644 --- a/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-5.yaml +++ b/src/service/resources/tosca/test-data/clmc-validator/valid/alerts_test_config-5.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: companyA-VR -# sfci: companyA-VR-premium topology_template: @@ -36,13 
+35,11 @@ topology_template: low_requests: # optional description - hence, valid event_type: deadman # deadman template is allowed - metric: storage.requests + metric: storage.* condition: threshold: 5 granularity: 60 - aggregation_method: first # resource type missing - optional, so it is valid - comparison_operator: lt action: implementation: - flame_sfemc @@ -54,12 +51,14 @@ topology_template: description: | This event triggers when the number of requests has increased relative to the number of requests received 120 seconds ago. + metadata: + percentage_evaluation: true event_type: relative metric: storage.requests condition: threshold: 100 # requests have increased by at least 100 granularity: 120 - aggregation_method: first # Although events of type relative do not require an aggregation method, the validator will not complain if one is given, it will simply ignore it + aggregation_method: first resource_type: flame_sfp: storage flame_sf: storage-users diff --git a/src/test/VERSION b/src/test/VERSION index 59124ba3ed3e948992d5cf837750ed347b4f46db..ae058b3516610f17300d78d120ed958d5dd1e988 100644 --- a/src/test/VERSION +++ b/src/test/VERSION @@ -1 +1 @@ -__version__ = "2.3.0" \ No newline at end of file +__version__ = "2.3.1" \ No newline at end of file diff --git a/src/test/clmctest/alerts/alerts_test_config.yaml b/src/test/clmctest/alerts/alerts_test_config.yaml index 4d5bca9c7387996bace7a0d96f5124bcf0485c64..7bf3407128fc7e386d4e308d68b8513a047da9b0 100644 --- a/src/test/clmctest/alerts/alerts_test_config.yaml +++ b/src/test/clmctest/alerts/alerts_test_config.yaml @@ -7,7 +7,6 @@ imports: metadata: servicefunctionchain: MS_Template_1 -# sfci: MS_I1 topology_template: @@ -18,16 +17,18 @@ topology_template: high_requests: description: | This event triggers when the number of requests for a given service function - exceeds a given threshold. - event_type: threshold + has increased by the given threshold. 
The difference between the current and the past value is computed as percentage. + metadata: + percentage_evaluation: True + event_type: relative metric: nginx.requests condition: - threshold: 1 - granularity: 5 - aggregation_method: mean + threshold: 0 + granularity: 10 + aggregation_method: last resource_type: flame_location: DC1 - comparison_operator: gte + comparison_operator: gt action: implementation: - http://172.40.231.200:9999/ @@ -48,7 +49,7 @@ topology_template: - flame_sfemc - http://172.40.231.200:9999/ increase_in_active_requests: - description: This event triggers when the cpu system usage is too high. + description: This event triggers when the number of nginx accept requests increases. event_type: relative metric: nginx.accepts condition: @@ -72,7 +73,7 @@ topology_template: description: | This event triggers when RTT measurements are missing for more than 12 seconds. event_type: deadman - metric: clcm.rtt + metric: graph_measurement.* condition: threshold: 0 granularity: 5 @@ -84,4 +85,20 @@ topology_template: action: implementation: - flame_sfemc + - http://172.40.231.200:9999/ + service_started: + description: This event triggers when the CLMC have started receiving nginx measurements. 
+ event_type: deadman + metric: nginx.* + condition: + threshold: 1 + granularity: 10 + resource_type: + flame_sfp: nginx + flame_sf: adaptive_streaming_nginx_I1 + flame_location: DC1 + flame_server: DC1 + comparison_operator: gte + action: + implementation: - http://172.40.231.200:9999/ \ No newline at end of file diff --git a/src/test/clmctest/alerts/test_alerts.py b/src/test/clmctest/alerts/test_alerts.py index ba7a96f1aab913862ab9391f306aa4a0a1376756..e87b8255dda0bdb4e963bc11117341ce44e4aa55 100644 --- a/src/test/clmctest/alerts/test_alerts.py +++ b/src/test/clmctest/alerts/test_alerts.py @@ -21,7 +21,9 @@ ## Created Date : 22-08-2018 ## Created for Project : FLAME """ + import datetime +import pytest from time import sleep, strptime from influxdb import InfluxDBClient @@ -38,6 +40,39 @@ SFEMC = "flame_sfemc" sfc, sfc_instance = "MS_Template_1", "MS_Template_1_1" +expected_alerts_list = [ + {"policy": "scale_nginx_policy", "trigger": "high_requests", "task_identifier": "46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", + "handlers": ["http://172.40.231.200:9999/"], + "topic_identifier": "46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", + "task_api_endpoint": "/kapacitor/v1/tasks/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", + "topic_api_endpoint": "/kapacitor/v1/alerts/topics/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", + "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b/handlers"}, + {"policy": "scale_nginx_policy", "trigger": "increase_in_active_requests", "task_identifier": "7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", + "handlers": [SFEMC, "http://172.40.231.200:9999/"], + "topic_identifier": "7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", + "task_api_endpoint": "/kapacitor/v1/tasks/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", + 
"topic_api_endpoint": "/kapacitor/v1/alerts/topics/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", + "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca/handlers"}, + {"policy": "scale_nginx_policy", "trigger": "increase_in_running_processes", "task_identifier": "f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", + "handlers": [SFEMC, "http://172.40.231.200:9999/"], + "topic_identifier": "f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", + "task_api_endpoint": "/kapacitor/v1/tasks/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", + "topic_api_endpoint": "/kapacitor/v1/alerts/topics/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", + "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8/handlers"}, + {"policy": "deadman_policy", "trigger": "no_measurements", "task_identifier": "f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", + "handlers": [SFEMC, "http://172.40.231.200:9999/"], + "topic_identifier": "f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", + "task_api_endpoint": "/kapacitor/v1/tasks/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", + "topic_api_endpoint": "/kapacitor/v1/alerts/topics/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", + "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320/handlers"}, + {"policy": "deadman_policy", "trigger": "service_started", "task_identifier": "2707cb9c0397c1aae0f831d5893aa769c6eaeb8834c974f2c14eb2c60be5bd73", + "handlers": ["http://172.40.231.200:9999/"], + "topic_identifier": "2707cb9c0397c1aae0f831d5893aa769c6eaeb8834c974f2c14eb2c60be5bd73", + "task_api_endpoint": "/kapacitor/v1/tasks/2707cb9c0397c1aae0f831d5893aa769c6eaeb8834c974f2c14eb2c60be5bd73", + 
"topic_api_endpoint": "/kapacitor/v1/alerts/topics/2707cb9c0397c1aae0f831d5893aa769c6eaeb8834c974f2c14eb2c60be5bd73", + "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/2707cb9c0397c1aae0f831d5893aa769c6eaeb8834c974f2c14eb2c60be5bd73/handlers"} +] + def is_valid_timestamp(str_timestamp): try: @@ -75,7 +110,7 @@ JSON_BODY_SCHEMA = Schema({ ], "values": [ [ - Or(str, int) + Or(str, int, float) ] ] } @@ -84,6 +119,26 @@ JSON_BODY_SCHEMA = Schema({ }) +@pytest.fixture(autouse=True) +def setup_teardown_fixture(rspec_config): + """ + This fixture is autoused for each test, and serves as a tear down functionality to clean up kapacitor resources after each test. + """ + + clmc_service_host = None + for host in rspec_config: + if host["name"] == "clmc-service": + clmc_service_host = host["ip_address"] + break + + assert clmc_service_host is not None + + yield + + # clean up the kapacitor alerts after each test + cleanup_alerts(clmc_service_host) + + class TestAlerts(object): def test_alert_triggers(self, rspec_config, set_up_tear_down_fixture): @@ -135,33 +190,6 @@ class TestAlerts(object): # check that the alerts can be fetched with a GET request print("Validate that the alerts were registered and can be fetched with a GET request.") - expected_alerts_list = [ - {"policy": "scale_nginx_policy", "trigger": "high_requests", "task_identifier": "46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", - "handlers": ["http://172.40.231.200:9999/"], - "topic_identifier": "46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", - "task_api_endpoint": "/kapacitor/v1/tasks/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", - "topic_api_endpoint": "/kapacitor/v1/alerts/topics/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b", - "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/46fb8800c8a5eeeb04b090d838d475df574a2e6d854b5d678fc981c096eb6c1b/handlers"}, - {"policy": "scale_nginx_policy", "trigger": 
"increase_in_active_requests", "task_identifier": "7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", - "handlers": [SFEMC, "http://172.40.231.200:9999/"], - "topic_identifier": "7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", - "task_api_endpoint": "/kapacitor/v1/tasks/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", - "topic_api_endpoint": "/kapacitor/v1/alerts/topics/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca", - "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/7a9867f9270dba6635ac3760a3b70bc929f5bd0f3bf582e45d27fbd437f528ca/handlers"}, - {"policy": "scale_nginx_policy", "trigger": "increase_in_running_processes", "task_identifier": "f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", - "handlers": [SFEMC, "http://172.40.231.200:9999/"], - "topic_identifier": "f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", - "task_api_endpoint": "/kapacitor/v1/tasks/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", - "topic_api_endpoint": "/kapacitor/v1/alerts/topics/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8", - "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/f5edaeb27fb847116be749c3815d240cbf0d7ba79aee1959daf0b3445a70f2c8/handlers"}, - {"policy": "deadman_policy", "trigger": "no_measurements", "task_identifier": "f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", - "handlers": [SFEMC, "http://172.40.231.200:9999/"], - "topic_identifier": "f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", - "task_api_endpoint": "/kapacitor/v1/tasks/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", - "topic_api_endpoint": "/kapacitor/v1/alerts/topics/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320", - "topic_handlers_api_endpoint": "/kapacitor/v1/alerts/topics/f7dab6fd53001c812d44533d3bbb6ef45f0d1d39b9441bc3c60402ebda85d320/handlers"} - ] - response = 
get("http://{0}/clmc-service/alerts/{1}/{2}".format(clmc_service_host, sfc, sfc_instance)) assert response.status_code == 200, response.text clmc_service_response = response.json() @@ -200,11 +228,24 @@ class TestAlerts(object): assert response.status_code == 200 sleep(0.25) - print("Wait 15 seconds for Kapacitor to trigger alerts...") - sleep(15) + print("Waiting for Kapacitor to trigger alerts, timeouts in 90 seconds...") + counter = 0 + + expected_alerts_count = len(expected_alerts_list) + while True: + alert_logs = listdir(LOG_TEST_FOLDER_PATH) + print("Checking count of alert log files: actual count {0}, expected count {1}. Created alert logs: {2}".format(len(alert_logs), expected_alerts_count, alert_logs)) + if len(alert_logs) == expected_alerts_count: + break + elif counter == 18: + print("90 seconds have passed...") + break + + sleep(5) + counter += 1 alert_logs = listdir(LOG_TEST_FOLDER_PATH) - assert len(alert_logs) == 4, "4 log files must have been created - one for each alert defined in the specification." + assert len(alert_logs) == expected_alerts_count, "{0} log files must have been created - one for each alert defined in the specification. 
Created alert logs: {1}".format(expected_alerts_count, alert_logs) # check the content of each log file for alert_log in alert_logs: @@ -213,13 +254,15 @@ class TestAlerts(object): with open(alert_log_path) as fh: alert_json = load(fh) + error = None try: JSON_BODY_SCHEMA.validate(alert_json) valid = True - except SchemaError: + except SchemaError as e: + error = e valid = False - assert valid, "Alert log content is invalid - {0}".format(alert_log_path) + assert valid, "Alert log content is invalid - {0}, error - {1}".format(alert_log_path, error) # delete the alerts with a DELETE request through the TOSCA specification with open(alerts_spec, 'rb') as alerts: @@ -231,7 +274,8 @@ class TestAlerts(object): json_response = response.json() # sort by trigger to ensure comparison order is correct assert sorted(json_response["deleted_alerts"], key=lambda x: x['trigger']) == [{"policy": "scale_nginx_policy", "trigger": "high_requests"}, {"policy": "scale_nginx_policy", "trigger": "increase_in_active_requests"}, - {"policy": "scale_nginx_policy", "trigger": "increase_in_running_processes"}, {"policy": "deadman_policy", "trigger": "no_measurements"}], \ + {"policy": "scale_nginx_policy", "trigger": "increase_in_running_processes"}, {"policy": "deadman_policy", "trigger": "no_measurements"}, + {"policy": "deadman_policy", "trigger": "service_started"}], \ "Incorrect list of deleted alerts" # sort by handler and trigger to ensure comparison order is correct assert sorted(json_response["deleted_handlers"], key=lambda x: (x['handler'], x['trigger'])) == [{"policy": "scale_nginx_policy", "trigger": "increase_in_active_requests", "handler": SFEMC}, @@ -240,7 +284,8 @@ class TestAlerts(object): {"policy": "scale_nginx_policy", "trigger": "high_requests", "handler": "http://172.40.231.200:9999/"}, {"policy": "scale_nginx_policy", "trigger": "increase_in_active_requests", "handler": "http://172.40.231.200:9999/"}, {"policy": "scale_nginx_policy", "trigger": 
"increase_in_running_processes", "handler": "http://172.40.231.200:9999/"}, - {"policy": "deadman_policy", "trigger": "no_measurements", "handler": "http://172.40.231.200:9999/"}], \ + {"policy": "deadman_policy", "trigger": "no_measurements", "handler": "http://172.40.231.200:9999/"}, + {"policy": "deadman_policy", "trigger": "service_started", "handler": "http://172.40.231.200:9999/"}], \ "Incorrect list of deleted handlers" def test_alerts_update_request(self, rspec_config): @@ -280,7 +325,7 @@ class TestAlerts(object): # find the latest timestamp of the registered alerts max_post_timestamp = 0 tasks = get("http://{0}/kapacitor/v1/tasks".format(clmc_service_host)).json()["tasks"] - for timestamp in tasks_timestamps(tasks, sfc, sfc_instance): + for timestamp in tasks_timestamps(tasks): max_post_timestamp = max(max_post_timestamp, timestamp) delay = 2 # seconds @@ -303,7 +348,7 @@ class TestAlerts(object): # find the earliest timestamp of the updated alerts min_put_timestamp = float("inf") tasks = get("http://{0}/kapacitor/v1/tasks".format(clmc_service_host)).json()["tasks"] - for timestamp in tasks_timestamps(tasks, sfc, sfc_instance): + for timestamp in tasks_timestamps(tasks): min_put_timestamp = min(min_put_timestamp, timestamp) print("Latest timestamp during the POST request", max_post_timestamp, "Earliest timestamp during the PUT request", min_put_timestamp) @@ -312,7 +357,7 @@ class TestAlerts(object): # delete the alerts with a DELETE request for the SFC instance response = delete("http://{0}/clmc-service/alerts/{1}/{2}".format(clmc_service_host, sfc, sfc_instance)) assert response.status_code == 200, "Incorrect status code returned after deleting the alert specification" - assert response.json() == {"deleted_alerts_count": 4} + assert response.json() == {"deleted_alerts_count": len(expected_alerts_list)} # create the alerts again so that they can be deleted with another endpoint with open(alerts_spec, 'rb') as alerts: @@ -324,7 +369,7 @@ class 
TestAlerts(object): # delete the alerts with a DELETE request for all SFC instances response = delete("http://{0}/clmc-service/alerts/{1}".format(clmc_service_host, sfc)) assert response.status_code == 200, "Incorrect status code returned after deleting the alert specification" - assert response.json() == {sfc_instance: {"deleted_alerts_count": 4}} + assert response.json() == {sfc_instance: {"deleted_alerts_count": len(expected_alerts_list)}} def test_data_deletion(rspec_config): @@ -385,20 +430,43 @@ def test_data_deletion(rspec_config): print("SFC data has been successfully deleted") -def tasks_timestamps(all_tasks, sfc_id, sfc_instance_id): +def tasks_timestamps(all_tasks): """ Generates the timestamps for the tasks related to the given SFC and SFC instance. :param all_tasks: the full list of tasks from kapacitor - :param sfc_id: SFC identifier - :param sfc_instance_id: SFC instance identifier """ for task in all_tasks: # get the configured variables of this alert task_config = task["vars"] # if configured for this SFC instance - if task_config["sfc"]["value"] == sfc_id and task_config["sfci"]["value"] == sfc_instance_id: + if task_config["sfc"]["value"] == sfc and task_config["sfci"]["value"] == sfc_instance: created_datestr = task["created"][:26] # ignore the timezone and only take the first 6 digits of the microseconds task_created_timestamp = datetime.datetime.strptime(created_datestr, "%Y-%m-%dT%H:%M:%S.%f") yield task_created_timestamp.timestamp() + + +def cleanup_alerts(clmc_service_host): + + tasks = get("http://{0}/kapacitor/v1/tasks".format(clmc_service_host)).json()["tasks"] + + for task in tasks: + # get the configured variables of this alert + task_config = task["vars"] + # if configured for this SFC instance + if task_config["sfc"]["value"] == sfc and task_config["sfci"]["value"] == sfc_instance: + task_id = task["id"] + kapacitor_response = delete("http://{0}/kapacitor/v1/tasks/{1}".format(clmc_service_host, task_id)) # delete alert task + assert 
kapacitor_response.status_code == 204 + + # delete the handlers + topic_id = task_id + for handler in get("http://{0}/kapacitor/v1/alerts/topics/{1}/handlers".format(clmc_service_host, topic_id)).json()["handlers"]: + handler_id = handler["id"] + kapacitor_response = delete("http://{0}/kapacitor/v1/alerts/topics/{1}/handlers/{2}".format(clmc_service_host, topic_id, handler_id)) # delete handler + assert kapacitor_response.status_code == 204 + + # delete the alert topic + kapacitor_response = delete("http://{0}/kapacitor/v1/alerts/topics/{1}".format(clmc_service_host, topic_id)) # delete topic + assert kapacitor_response.status_code == 204