diff --git a/clmctest/monitoring/E2EAggregator.py b/clmctest/monitoring/E2EAggregator.py
index 8bef3d921ef0345fc3db6eb7ef6933b6c7101f34..58ec908fa28e484e93e63866539db0b1df809851 100644
--- a/clmctest/monitoring/E2EAggregator.py
+++ b/clmctest/monitoring/E2EAggregator.py
@@ -59,6 +59,10 @@ class Aggregator(Thread):
         # a stop flag event object used to handle the killing of the thread
         self._stop_flag = Event()
 
+        # cache-like dictionaries storing the last reported values, which can be used to fill in missing values
+        self.network_cache = {}
+        self.service_cache = {}
+
     def stop(self):
         """
         A method used to stop the thread.
@@ -84,61 +88,73 @@ class Aggregator(Thread):
             # query the network delays and group them by path ID
             network_delays = {}
             result = self.db_client.query(
-                'SELECT mean(delay) as "net_delay" FROM "E2EMetrics"."autogen"."network_delays" WHERE time >= {0} and time < {1} GROUP BY path, source, target'.format(
+                'SELECT mean(latency) as "net_latency", mean(bandwidth) as "net_bandwidth" FROM "E2EMetrics"."autogen"."network_delays" WHERE time >= {0} and time < {1} GROUP BY path, source, target'.format(
                     boundary_time_nano, current_time_nano))
             for item in result.items():
                 metadata, result_points = item
                 # measurement = metadata[0]
                 tags = metadata[1]
 
-                network_delays[(tags['path'], tags['source'], tags['target'])] = next(result_points)['net_delay']
+                point = next(result_points)
+                network_delays[(tags['path'], tags['source'], tags['target'])] = point['net_latency'], point['net_bandwidth']
+                self.network_cache[(tags['path'], tags['source'], tags['target'])] = point['net_latency'], point['net_bandwidth']
 
             # query the service delays and group them by endpoint, service function instance and sfr
             service_delays = {}
-            result = self.db_client.query('SELECT mean(response_time) as "response_time" FROM "E2EMetrics"."autogen"."service_delays" WHERE time >= {0} and time < {1} GROUP BY endpoint, sf_instance, sfr'.format(boundary_time_nano, current_time_nano))
+            result = self.db_client.query('SELECT mean(response_time) as "response_time", mean(request_size) as "request_size", mean(response_size) as "response_size" FROM "E2EMetrics"."autogen"."service_delays" WHERE time >= {0} and time < {1} GROUP BY endpoint, sf_instance, sfr'.format(boundary_time_nano, current_time_nano))
             for item in result.items():
                 metadata, result_points = item
                 # measurement = metadata[0]
                 tags = metadata[1]
-                service_delays[tags['sfr']] = (next(result_points)['response_time'], tags['endpoint'], tags['sf_instance'])
+                point = next(result_points)
+                service_delays[tags['sfr']] = (point['response_time'], point['request_size'], point['response_size'], tags['endpoint'], tags['sf_instance'])
+                self.service_cache[tags['sfr']] = (point['response_time'], point['request_size'], point['response_size'], tags['endpoint'], tags['sf_instance'])
 
             # for each network path check if there is a media service delay report for the target sfr - if so, generate an e2e_delay measurement
             for path in network_delays:
                 # check if target sfr is reported in service delays, in other words - if there is a media service instance being connected to target sfr
                 path_id, source, target = path
-                if target not in service_delays:
+                if target not in service_delays and target not in self.service_cache:
                     # if not continue with the other network path reports
                     continue
 
                 e2e_arguments = {"path_ID": None, "source_SFR": None, "target_SFR": None, "endpoint": None, "sf_instance": None, "delay_forward": None, "delay_reverse": None,
-                                 "delay_service": None, "time": boundary_time}
+                                 "delay_service": None, "avg_request_size": None, "avg_response_size": None, "avg_bandwidth": None, "time": boundary_time}
 
                 e2e_arguments['path_ID'] = path_id
-                e2e_arguments['delay_forward'] = network_delays[path]
+                e2e_arguments['source_SFR'] = source
+                e2e_arguments['target_SFR'] = target
+                e2e_arguments['delay_forward'] = network_delays[path][0]
+                e2e_arguments['avg_bandwidth'] = network_delays[path][1]
 
                 # reverse the path ID to get the network delay for the reversed path
                 reversed_path = (path_id, target, source)
-                assert reversed_path in network_delays  # reversed path must always be reported with the forward one - if there is network path A-B, there is also network path B-A
-                e2e_arguments['delay_reverse'] = network_delays[reversed_path]
+                if reversed_path in network_delays or reversed_path in self.network_cache:
+                    # get the reverse delay, use the latest value if reported or the cache value
+                    e2e_arguments['delay_reverse'] = network_delays.get(reversed_path, self.network_cache.get(reversed_path))[0]
+                else:
+                    e2e_arguments['delay_reverse'] = None
 
                 # get the response time of the media component connected to the target SFR
-                service_delay = service_delays[target]
-                response_time, endpoint, sf_instance = service_delay
+                service_delay = service_delays.get(target, self.service_cache.get(target))
+                response_time, request_size, response_size, endpoint, sf_instance = service_delay
                 # put these points in the e2e arguments dictionary
                 e2e_arguments['delay_service'] = response_time
+                e2e_arguments['avg_request_size'] = request_size
+                e2e_arguments['avg_response_size'] = response_size
                 e2e_arguments['endpoint'] = endpoint
                 e2e_arguments['sf_instance'] = sf_instance
 
                 # if all the arguments of the e2e delay measurements were reported, then generate and post to Influx an E2E measurement row
-                if None not in e2e_arguments.items():
+                if None not in e2e_arguments.values():
                     self.db_client.write_points(
                         lp.generate_e2e_delay_report(e2e_arguments['path_ID'], e2e_arguments['source_SFR'], e2e_arguments['target_SFR'], e2e_arguments['endpoint'],
                                                      e2e_arguments['sf_instance'], e2e_arguments['delay_forward'], e2e_arguments['delay_reverse'], e2e_arguments['delay_service'],
-                                                     e2e_arguments['time']))
+                                                     e2e_arguments["avg_request_size"], e2e_arguments['avg_response_size'], e2e_arguments['avg_bandwidth'], e2e_arguments['time']))
 
             old_timestamp = current_time
             # wait until {REPORT_PERIOD} seconds have passed
-            while current_time != old_timestamp + self.REPORT_PERIOD:
+            while current_time < old_timestamp + self.REPORT_PERIOD:
                 sleep(1)
                 current_time = int(time())
 
diff --git a/clmctest/monitoring/E2ESim.py b/clmctest/monitoring/E2ESim.py
index a8974f880916ff5b568b98a859e3161b39f17d03..a6520d280b52353ff7120746949a9127ecbed875 100644
--- a/clmctest/monitoring/E2ESim.py
+++ b/clmctest/monitoring/E2ESim.py
@@ -74,49 +74,79 @@ class Simulator(object):
 
         # all network delays start from 1ms, the dictionary stores the information to report
         paths = [
-            {'target': 'SR3',
-             'source': 'SR1',
-             'path_id': 'SR1---SR3',
-             'network_delay': 1},
-            {'target': 'SR1',
-             'source': 'SR3',
-             'path_id': 'SR1---SR3',
-             'network_delay': 1}
+            {
+                'target': 'SR2',
+                'source': 'SR1',
+                'path_id': 'SR1---SR2',
+                'latency': 5,
+                'bandwidth': 100*1024*1024
+            },
+            {
+                'target': 'SR1',
+                'source': 'SR2',
+                'path_id': 'SR1---SR2',
+                'latency': 5,
+                'bandwidth': 100*1024*1024
+            },
+            {
+                'target': 'SR3',
+                'source': 'SR1',
+                'path_id': 'SR1---SR3',
+                'latency': 5,
+                'bandwidth': 100*1024*1024
+            },
+            {
+                'target': 'SR1',
+                'source': 'SR3',
+                'path_id': 'SR1---SR3',
+                'latency': 5,
+                'bandwidth': 100*1024*1024
+            }
         ]
 
+        service_function_instances = [
+            {
+                'endpoint': 'ms1.flame.org',
+                'sf_instance': 'sr2.ms1.flame.org',  # TODO: what did we decide the sf_instance would look like?
+                'sfr': 'SR2',
+                'service_delay': 40,
+                'cpus': 1
+            },
+            {
+                'endpoint': 'ms1.flame.org',
+                'sf_instance': 'sr3.ms1.flame.org',  # TODO: what did we decide the sf_instance would look like?
+                'sfr': 'SR3',
+                'service_delay': 10,
+                'cpus': 4
+            }
+        ]
+
+        av_request_size = 10 * 1024 * 1024  # average request size measured by service function / Bytes
+        av_response_size = 1 * 1024  # average response size measured by service function / Bytes
+
         # current time in seconds (to test the aggregation we write influx data points related to future time), so we start from the current time
         start_time = int(time.time())
 
         sim_time = start_time
 
-        mean_delay_seconds_media = 10  # initial mean media service delay
-        sample_period_net = 2  # sample period for reporting network delays (measured in seconds) - net measurements reported every 2s
-        sample_period_media = 5  # sample period for reporting media service delays (measured in seconds) - service measurements reported every 5 seconds
+        sample_period_net = 1  # sample period for reporting network delays (measured in seconds)
+        sample_period_media = 5  # sample period for reporting media service delays (measured in seconds)
 
         for i in range(0, self.SIMULATION_LENGTH):
-            # measure net delay every 2 seconds for path SR1-SR3 (generates on tick 0, 2, 4, 6, 8, 10.. etc.)
+            # report one of the network delays every sample_period_net seconds
             if i % sample_period_net == 0:
-                path = paths[0]
-                self.db_client.write_points(lp.generate_network_delay_report(path['path_id'], path['source'], path['target'], path['network_delay'], sim_time))
+                path = random.choice(paths)
+                self.db_client.write_points(
+                    lp.generate_network_delay_report(path['path_id'], path['source'], path['target'], path['latency'], path['bandwidth'], sim_time))
 
                 # increase/decrease the delay in every sample report (min delay is 1)
-                path['network_delay'] = max(1, path['network_delay'] + random.randint(-3, 3))
-
-            # measure net delay every 2 seconds for path SR2-SR3 (generates on tick 1, 3, 5, 7, 9, 11.. etc.)
-            if (i+1) % sample_period_net == 0:
-                path = paths[1]
-                self.db_client.write_points(lp.generate_network_delay_report(path['path_id'], path['source'], path['target'], path['network_delay'], sim_time))
+                path['latency'] = max(1, path['latency'] + random.randint(-3, 3))
 
-                # increase/decrease the delay in every sample report (min delay is 1)
-                path['network_delay'] = max(1, path['network_delay'] + random.randint(-3, 3))
-
-            # measure service response time every 5 seconds
+            # report one of the service_function_instance response times every sample_period_media seconds
             if i % sample_period_media == 0:
-                self.db_client.write_points(lp.generate_service_delay_report(mean_delay_seconds_media, "endpoint-1",
-                                                                             "ms-A.ict-flame.eu", "SR3",  sim_time))
-
-                # increase/decrease the delay in every sample report (min delay is 10)
-                mean_delay_seconds_media = max(10, mean_delay_seconds_media + random.choice([random.randint(10, 20), random.randint(-20, -10)]))
+                service = random.choice(service_function_instances)
+                self.db_client.write_points(lp.generate_service_delay_report(
+                    service['endpoint'], service['sf_instance'], service['sfr'], service['service_delay'], av_request_size, av_response_size, sim_time))
 
             # increase the time by one simulation tick
             sim_time += self.TICK
diff --git a/clmctest/monitoring/LineProtocolGenerator.py b/clmctest/monitoring/LineProtocolGenerator.py
index 432f27d41769bcf68d776b91f719d9f4bcb8d122..b9bf9a6249234131506e9fddca2fa77af8450ede 100644
--- a/clmctest/monitoring/LineProtocolGenerator.py
+++ b/clmctest/monitoring/LineProtocolGenerator.py
@@ -29,18 +29,21 @@ import uuid
 from random import randint
 
 
-def generate_e2e_delay_report(path_id, source_sfr, target_sfr, endpoint, sf_instance, delay_forward, delay_reverse, delay_service, time):
+def generate_e2e_delay_report(path_id, source_sfr, target_sfr, endpoint, sf_instance, delay_forward, delay_reverse, delay_service, avg_request_size, avg_response_size, avg_bandwidth, time):
     """
     Generates a combined averaged measurement about the e2e delay and its contributing parts
 
-    :param path_ID: The path identifier, which is a bidirectional path ID for the request and the response path
-    :param source_SFR: source service router
-    :param target_SFR: target service router
+    :param path_id: The path identifier, which is a bidirectional path ID for the request and the response path
+    :param source_sfr: source service router
+    :param target_sfr: target service router
     :param endpoint: endpoint of the media component
     :param sf_instance: service function instance (media component)
     :param delay_forward: Path delay (Forward direction)
     :param delay_reverse: Path delay (Reverse direction)
     :param delay_service: the media service component response time
+    :param avg_request_size: averaged request size
+    :param avg_response_size: averaged response size
+    :param avg_bandwidth: averaged bandwidth
     :param time: measurement timestamp
     :return: a list of dict-formatted reports to post on influx
     """
@@ -56,7 +59,10 @@ def generate_e2e_delay_report(path_id, source_sfr, target_sfr, endpoint, sf_inst
                "fields": {
                    "delay_forward": float(delay_forward),
                    "delay_reverse": float(delay_reverse),
-                   "delay_service": float(delay_service)
+                   "delay_service": float(delay_service),
+                   "avg_request_size": float(avg_request_size),
+                   "avg_response_size": float(avg_response_size),
+                   "avg_bandwidth": float(avg_bandwidth)
                },
                "time": _getNSTime(time)
                }]
@@ -64,14 +70,15 @@ def generate_e2e_delay_report(path_id, source_sfr, target_sfr, endpoint, sf_inst
     return result
 
 
-def generate_network_delay_report(path_id, source_sfr, target_sfr, e2e_delay, time):
+def generate_network_delay_report(path_id, source_sfr, target_sfr, latency, bandwidth, time):
     """
     Generates a platform measurement about the network delay between two specific service routers.
 
     :param path_id: the identifier of the path between the two service routers
     :param source_sfr: the source service router
     :param target_sfr: the target service router
-    :param e2e_delay: the e2e network delay for traversing the path between the two service routers
+    :param latency: the e2e network delay for traversing the path between the two service routers
+    :param bandwidth: the bandwidth of the path (minimum of bandwidths of the links it is composed of)
     :param time: the measurement timestamp
     :return: a list of dict-formatted reports to post on influx
     """
@@ -83,7 +90,8 @@ def generate_network_delay_report(path_id, source_sfr, target_sfr, e2e_delay, ti
                    "target": target_sfr
                },
                "fields": {
-                   "delay": e2e_delay
+                   "latency": latency,
+                   "bandwidth": bandwidth
                },
                "time": _getNSTime(time)
                }]
@@ -91,14 +99,16 @@ def generate_network_delay_report(path_id, source_sfr, target_sfr, e2e_delay, ti
     return result
 
 
-def generate_service_delay_report(response_time, endpoint, sf_instance, sfr, time):
+def generate_service_delay_report(endpoint, sf_instance, sfr, response_time, request_size, response_size, time):
     """
     Generates a service measurement about the media service response time.
 
-    :param response_time: the media service response time (This is not the response time for the whole round-trip, but only for the processing part of the media service component)
     :param endpoint: endpoint of the media component
     :param sf_instance: service function instance
     :param sfr: the service function router that connects the endpoint of the SF instance to the FLAME network
+    :param response_time: the media service response time (this is not the response time for the whole round-trip, but only for the processing part of the media service component)
+    :param request_size: the size of the request received by the service in Bytes
+    :param response_size: the size of the response sent by the service in Bytes
     :param time: the measurement timestamp
     :return: a list of dict-formatted reports to post on influx
     """
@@ -111,6 +121,8 @@ def generate_service_delay_report(response_time, endpoint, sf_instance, sfr, tim
                },
                "fields": {
                    "response_time": response_time,
+                   "request_size": request_size,
+                   "response_size": response_size
                },
                "time": _getNSTime(time)
                }]
diff --git a/clmctest/monitoring/test_e2eresults.py b/clmctest/monitoring/test_e2eresults.py
index 5486bcb618f645dbdbc69887adf6000c2c423e08..8e48b6e89ce0617f7f16549ee761b1aa14dd5dc4 100644
--- a/clmctest/monitoring/test_e2eresults.py
+++ b/clmctest/monitoring/test_e2eresults.py
@@ -46,6 +46,8 @@ class TestE2ESimulation(object):
         print("Starting aggregator...")
         e2e_aggregator.start()
 
+        # give the aggregator thread a moment to start before the simulation begins reporting
+        time.sleep(1)
+
         print("Running simulation, please wait...")
         e2e_simulator.run()
 
@@ -56,17 +58,18 @@ class TestE2ESimulation(object):
         print("... stopping aggregator")
         e2e_aggregator.stop()
 
-
     @pytest.mark.parametrize("query, expected_result", [
         ('SELECT count(*) FROM "E2EMetrics"."autogen"."network_delays"',
-         {"time": "1970-01-01T00:00:00Z", "count_delay": 120}),
+         {"time": "1970-01-01T00:00:00Z", "count_latency": 120, "count_bandwidth": 120}),
         ('SELECT count(*) FROM "E2EMetrics"."autogen"."service_delays"',
-         {"time": "1970-01-01T00:00:00Z", "count_response_time": 24}),
+         {"time": "1970-01-01T00:00:00Z", "count_response_time": 24, "count_request_size": 24, "count_response_size": 24}),
         ('SELECT count(*) FROM "E2EMetrics"."autogen"."e2e_delays"',
-         {"time": "1970-01-01T00:00:00Z", "count_delay_forward": 24, "count_delay_reverse": 24, "count_delay_service": 24}),
+         {"time": "1970-01-01T00:00:00Z", "count_delay_forward": 38, "count_delay_reverse": 38, "count_delay_service": 38,
+          "count_avg_request_size": 38, "count_avg_response_size": 38, "count_avg_bandwidth": 38}),
 
         ('SELECT mean(*) FROM "E2EMetrics"."autogen"."e2e_delays"',
-         {"time": "1970-01-01T00:00:00Z", "mean_delay_forward": 13.159722222222223, "mean_delay_reverse": 3.256944444444444, "mean_delay_service": 32.791666666666664}),
+         {"time": "1970-01-01T00:00:00Z", "mean_delay_forward": 8.048245614035087, "mean_delay_reverse": 13.043859649122808, "mean_delay_service": 23.42105263157895,
+          'mean_avg_request_size': 10485760, 'mean_avg_response_size': 1024, 'mean_avg_bandwidth': 104857600}),
         ])
     def test_simulation(self, influx_db, query, expected_result):
         """
diff --git a/docs/total-service-request-delay.md b/docs/total-service-request-delay.md
index 774c3616b10d852140934c67dc84fce568ca78e4..a27f47665f3f77341193feff8f74f51e89f5bb3b 100644
--- a/docs/total-service-request-delay.md
+++ b/docs/total-service-request-delay.md
@@ -16,6 +16,7 @@ If we ignore the OSI L6 protocol (e.g. HTTP, FTP, Tsunami) then we are modelling
 
 ```
 network_delay = latency + (time difference from start of the data to the end of the data)
+              = latency + data_delay
 ```
 
 ### Latency
@@ -24,10 +25,10 @@ The latency (or propagation delay) of the network path is the time taken for a p
 
 latency = distance / speed
 
-For optical fibre (or even an eletric wire), the speed naively would be the speed of light. In fact, the speed is slower than this (in optical fibre this is because of the internal refraction that occurs, which is different for different wavelengths). According to http://www.m2optics.com/blog/bid/70587/Calculating-Optical-Fiber-Latency the delay (1/speed) is approximately 5 microseconds / km
+For optical fibre (or even an electric wire), the speed naively would be the speed of light. In fact, the speed is slower than this (in optical fibre this is because of the internal refraction that occurs, which is different for different wavelengths). According to [m2optics.com](http://www.m2optics.com/blog/bid/70587/Calculating-Optical-Fiber-Latency) the delay (1/speed) is approximately 5 microseconds / km
 
 ```
-if 
+if
     distance is in m
     delay is in s/m
     latency is in s
@@ -48,35 +49,179 @@ if
     data_size is in Bytes
     bandwidth is in Mb/s
     data_delay is in s
-then 
+then
     data_delay = data_size * 8 / (bandwidth * 1E6)
 ```
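+
+As a worked example with illustrative numbers: transferring 1 MB (1E6 Bytes) over a 100 Mb/s link gives `data_delay = 1E6 * 8 / (100 * 1E6) = 0.08 s`.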
 
-The data_size naively is the size of the data you want to send over the network (call this the "file_size"). However, the data is split into packets and each packet has a header on it so the amount of data going over the network is actually more than the amount sent.
+The data_size naively is the size of the data you want to send over the network (call this the "file_size"). However, the data is split into packets and each packet has a header on it so the amount of data going over the network is actually more than the amount sent. The header includes contributions from (at least) the L6 protocol (e.g. HTTP), L4 (e.g. TCP) and L3 (e.g. IP) layers.
 
 ```
-let 
+let
     packet_size = packet_header_size + packet_payload_size
 then
     data_size = (packet_size / packet_payload_size) * file_size
 or
-    data_size = (packet_size / packet_size - packet_header_size) * file_size
+    data_size = [packet_size / (packet_size - packet_header_size)] * file_size
+              = file_size * packet_size / (packet_size - packet_header_size)
 ```
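+
+With illustrative values of a 1500 Byte packet carrying 40 Bytes of headers, the factor is `1500 / (1500 - 40) ≈ 1.027`, i.e. about 3% more data goes over the network than is in the file.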
 
-### Total delay
+### Measuring and Predicting
+
+Bringing the above parts together we have:
 
 ```
-delay = latency + data_delay
-    = (distance * 5 / 1E9) + {[(packet_size / packet_size - packet_header_size) * file_size] * 8 / bandwidth * 1E6}
+network_delay = latency + data_delay
+              = (distance * 5 / 1E9) + {[file_size * packet_size / (packet_size - packet_header_size)] * 8 / (bandwidth * 1E6)}
+              = (distance * 5 / 1E9) + (8 / 1E6) * (file_size / bandwidth) * [packet_size / (packet_size - packet_header_size)]
 ```
 
-### Effect of Protocol
+i.e. essentially `file_size / bandwidth`, with an adjustment for the extra data transmitted as packet headers and some unit-conversion factors.
+
+We want to be able to measure the `network_delay` and also want to be able to predict what the delay is likely to be for a given deployment.
+
+Parameter | Known / measured
+----------|--------
+latency | measured by network probes
+distance | sometimes known
+packet_size | known (a property of the network)
+packet_header_size | known (at least for L3 and L4)
+file_size | measured at the service function
+bandwidth | known (a property of the network), can also be measured
+
+Measuring the actual `latency` can be done in software. For a given `file_size`, the `network_delay` could then be predicted.
+
+*We are ignoring network congestion and the effect of the protocol (see below).*
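+
+A minimal sketch of such a prediction in Python, assuming illustrative defaults of a 1500 Byte packet (Ethernet MTU) and 40 Bytes of TCP/IP headers:
+
+```
+def predict_network_delay(latency, file_size, bandwidth,
+                          packet_size=1500, packet_header_size=40):
+    """Predict network_delay / s from a measured latency / s, a file_size in
+    Bytes and a bandwidth in Mb/s. The packet defaults are assumptions
+    (Ethernet MTU, TCP/IP headers), not measured constants."""
+    # inflate the file size to account for per-packet header overhead
+    data_size = file_size * packet_size / (packet_size - packet_header_size)
+    # convert Bytes to bits and Mb/s to b/s
+    return latency + data_size * 8 / (bandwidth * 1E6)
+```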
+
+### Effect of protocol
 
-The choice of protocol has a large effect in networks with a high bandwidth-delay product.
+The analysis above ignores the network protocol. However, the choice of protocol has a large effect in networks with a high bandwidth-delay product.
 
 In data communications, bandwidth-delay product is the product of a data link's capacity (in bits per second) and its round-trip delay time (in seconds). The result, an amount of data measured in bits (or bytes), is equivalent to the maximum amount of data on the network circuit at any given time, i.e., data that has been transmitted but not yet acknowledged.
 
 TCP for instance expects acknowledgement of every packet sent and if the sender has not received an acknowledgement within a specified time period then the packet will be retransmitted. Furthermore, TCP uses a flow-control method whereby the receiver specifies how much data it is willing to buffer and the sending host must pause sending and wait for acknowledgement once that amount of data is sent.
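+
+For example, a 100 Mb/s link with a 20 ms round-trip time has a bandwidth-delay product of `100E6 * 0.02 = 2E6` bits (250 kB); a sender whose send window is smaller than this cannot keep the link full.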
 
+### Effect of congestion
+
+The analysis above considers the best case where the whole bandwidth of the link is available for the data transfer.
+
 ## Service Delay
 
+A particular service function may have several operations (API calls) on it. A model of service function performance needs to consider the resource the service function is deployed upon (and its variability and reliability), the availability of the resource (i.e. whether the service function has the resource to itself), the workload (a statistical distribution of API calls and request sizes) and the speed at which the resource can compute the basic computations invoked by the requests.
+
+We must simplify sufficiently to make the problem tractable, but not so much that the result is of no practical use.
+
+To simplify we can:
+
+* assume that the resource is invariable, 100% available and 100% reliable;
+* assume that the distribution of API calls is constant and that the workload can be represented sufficiently by the average request size.
+
+To be concrete, if a service function has two API calls: `transcode_video(video_data)` and `get_status()` then we would like to model the average response time over "normal" usage which might be 10% of calls to `transcode_video` and 90% of calls to `get_status` and a variety of `video_data` sizes with a defined average size.
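+
+With illustrative response times of 2 s for `transcode_video` and 0.01 s for `get_status`, that mix would average `0.1 * 2 + 0.9 * 0.01 = 0.209 s`.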
+
+### Measuring
+
+As an example, the `minio` service already reports the average response time over all API calls so, for that service at least, measuring the `service_delay` is easy. We expect to also be able to measure the average `file_size`, which will do as a measure of workload.
+
+### Predicting
+
+As noted above, a simple model must consider:
+
+* the resource the service function is deployed upon (e.g. CPU, memory, disk);
+* the workload (an average request size);
+* the speed at which the resource can compute the basic computations invoked by the requests (dependent on the service function).
+
+We can therefore write that:
+
+```
+service_delay = f(resource, workload, service function characteristics)
+```
+
+For our simplified workload we could assume that this can be written as:
+
+```
+service_delay = workload * f(resource, service function characteristics)
+```
+
+The resource could be described in terms of the number of CPUs, amount of RAM and amount of disk. Even if the resource was a physical machine more detail would be required such as the CPU clock speed, CPU cache sizes, RAM speed, disk speed, etc. In a virtualised environment it is even more complicated as elements of the physical CPU may or may not be exposed to the virtual CPU (which may in fact be emulated).
+
+Benchmarks are often used to help measure the performance of a resource so that one resource may be compared to another without going into all the detail of the precise architecture. Application benchmarks (those executing realistic workloads such as matrix operations or fast Fourier transforms) can be more useful than general benchmark scores (such as SPECint or SPECfp). For more information on this, see [Snow White Clouds and the Seven Dwarfs](https://eprints.soton.ac.uk/273157/1/23157.pdf).
+
+The best benchmark for a service function is the service function itself combined with a representative workload. That is, to predict the performance of a service function on a given resource, it is best to just run it and find out. In the absence of that, the next best would be to execute Dwarf benchmarks on each resource type and correlate them with the service functions, but that is beyond what we can do now.
+
+We might execute a single benchmark such as the [Livermore Loops](http://www.netlib.org/benchmark/livermorec) benchmark which stresses a variety of CPU operations. Livermore Loops provides a single benchmark figure in Megaflops/sec.
+
+Our service_delay equation would then just reduce to:
+
+```
+service_delay = workload * f(benchmark, service function characteristics)
+              = workload * service_function_scaling_factor / benchmark
+```
+
+The `service_function_scaling_factor` essentially scales the `workload` number into a number of Megaflops. So for a `workload` in Bytes, the `service_function_scaling_factor` would represent Megaflops/Byte.
+
+If we don't have a benchmark then the best we can do is approximate the benchmark by the number of CPUs:
+
+```
+service_delay = workload * f(benchmark, service function characteristics)
+              = workload * service_function_scaling_factor / cpus
+```
+
+Is this a simplification too far? It ignores the size of RAM, for instance, which cannot normally be included as a linear factor (i.e. twice as much RAM does not always give twice the performance). Not having sufficient RAM results in disk swapping or complete failure. Once you have enough for a workload, adding more makes no difference.
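+
+A sketch of this approximation, with the `service_function_scaling_factor` assumed known:
+
+```
+def predict_service_delay(request_size, scaling_factor, cpus):
+    """Predict service_delay / s from a request_size in Bytes and a
+    service_function_scaling_factor in Megaflops/Byte, using the CPU count
+    as a stand-in for a proper benchmark (the approximation above)."""
+    return request_size * scaling_factor / cpus
+```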
+
+## Conclusion
+
+The total delay is:
+
+```
+total_delay = forward_network_delay + service_delay + reverse_network_delay
+```
+
+To *measure* the `total_delay` we need:
+
+```
+total_delay = forward_latency + forward_data_delay + service_delay + reverse_latency + reverse_data_delay
+            = forward_latency
+                + {(8 / 1E6) * (request_size / bandwidth) * [packet_size / (packet_size - packet_header_size)]}
+                + service_delay
+                + reverse_latency
+                + {(8 / 1E6) * (response_size / bandwidth) * [packet_size / (packet_size - packet_header_size)]}
+```
+
+With:
+
+* forward_latency / s (measured by network probe)
+* reverse_latency / s (measured by network probe)
+* request_size / Bytes (measured at service)
+* response_size / Bytes (measured at service)
+* bandwidth / Mb/s   (b = bit) (assumed constant and known or measured)
+* packet_size / Bytes (constant and known)
+* packet_header_size / Bytes (constant and known)
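+
+A sketch of this measurement-based calculation (same illustrative packet defaults as above):
+
+```
+def measured_total_delay(forward_latency, reverse_latency, service_delay,
+                         request_size, response_size, bandwidth,
+                         packet_size=1500, packet_header_size=40):
+    """Combine measured latencies / s, a measured service_delay / s and
+    measured request/response sizes in Bytes with a known bandwidth in Mb/s."""
+    overhead = packet_size / (packet_size - packet_header_size)
+    return (forward_latency
+            + (8 / 1E6) * (request_size / bandwidth) * overhead
+            + service_delay
+            + reverse_latency
+            + (8 / 1E6) * (response_size / bandwidth) * overhead)
+```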
+
+This calculation assumes:
+
+* there is no network congestion, i.e. the whole bandwidth is available
+* that the protocol (such as TCP) has no effect (see discussion of flow control above)
+* there is no data loss on the network
+* that the service delay is proportional to the `request_size`, i.e. that the service is processing the data in the request
+* that the service does not start processing until the complete request is received
+* that the amount of memory and disk on the compute resource is irrelevant
+* that the service delay is inversely proportional to the number of CPUs (and all CPUs are equal)
+* that the compute resource is invariable, 100% available and 100% reliable
+* that the distribution of API calls is constant and that the workload can be represented sufficiently by the average request size
+
+To *predict* the `total_delay` we need:
+
+```
+total_delay = forward_latency + forward_data_delay + service_delay + reverse_latency + reverse_data_delay
+            = forward_latency
+                + {(8 / 1E6) * (request_size / bandwidth) * [packet_size / (packet_size - packet_header_size)]}
+                + request_size * service_function_scaling_factor / cpus
+                + reverse_latency
+                + {(8 / 1E6) * (response_size / bandwidth) * [packet_size / (packet_size - packet_header_size)]}
+```
+
+With:
+
+* service_function_scaling_factor / Mflops/Byte (known, somehow)
+* cpus (unitless) (known)
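+
+Combining the sketches above, the prediction (under all of the assumptions listed) might look like:
+
+```
+def predict_total_delay(forward_latency, reverse_latency,
+                        request_size, response_size,
+                        bandwidth, scaling_factor, cpus):
+    """Predict total_delay / s using the network and service sketches above."""
+    return (predict_network_delay(forward_latency, request_size, bandwidth)
+            + predict_service_delay(request_size, scaling_factor, cpus)
+            + predict_network_delay(reverse_latency, response_size, bandwidth))
+```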
+
+As discussed above, you could also predict the latency if you know the length of a link but this seems a bit too theoretical.