Compare commits

...

26 Commits

Author SHA1 Message Date
flowzone-app[bot]
b8032edc04
v16.12.8 2025-03-12 14:50:35 +00:00
flowzone-app[bot]
175872b358
Merge pull request #2408 from balena-os/fix-socket-timeout
Ensure poll socket timeout is defined early
2025-03-12 14:49:34 +00:00
Felipe Lalanne
ae337a1dd7
Remove GOT retries on state poll
The state poll already has its own retry implementation, making GOT's default
retries unnecessary.

Change-type: patch
2025-03-12 10:59:16 -03:00
Felipe Lalanne
bdbc6a4ba4
Ensure poll socket timeout is defined early
We have observed that even when setting the socket timeout on the
state poll https request, the timeout is only applied once the socket is
connected. This causes issues with Node's auto family selection (happy
eyeballs), as the default https timeout is 5s, which means that a larger
[auto select attempt timeout](https://nodejs.org/docs/latest-v22.x/api/net.html#netgetdefaultautoselectfamilyattempttimeout) may result in the socket timing out before all connection attempts have been tried.

This commit sets a different https Agent for state polling, with a
timeout matching the `apiRequestTimeout` used for other request events.

Change-type: patch
2025-03-12 10:59:11 -03:00
flowzone-app[bot]
978652b292
v16.12.7 2025-03-06 19:11:20 +00:00
flowzone-app[bot]
7771c0e96b
Merge pull request #2406 from balena-os/release-locks-on-app-remove
Release locks when removing apps
2025-03-06 19:10:38 +00:00
Felipe Lalanne
026dc0aed2
Release locks when removing apps
This prevents leftover locks that can prevent other operations from
taking place.

Change-type: patch
2025-03-06 11:50:31 -03:00
flowzone-app[bot]
5ef6b054fd
v16.12.6 2025-03-04 14:25:09 +00:00
flowzone-app[bot]
3cca2b7ecd
Merge pull request #2404 from balena-os/polling-improvements
Polling improvements
2025-03-04 14:24:18 +00:00
Felipe Lalanne
3d8bd28f5a
Update GOT to v14.4.6 2025-03-04 10:46:47 -03:00
Felipe Lalanne
6d00be2093
Log non-API errors during state poll
The supervisor was failing silently if an error happened while establishing the
connection (e.g. requesting the socket).

Change-type: patch
2025-03-04 10:46:45 -03:00
Felipe Lalanne
f8bdb14335
Fix target poll healthcheck
The Target.lastFetch time compared when performing the healthcheck
resets any time a poll is attempted no matter the outcome. This changes
the behavior so the time is reset only on a successful poll.

Change-type: patch
2025-03-04 10:45:31 -03:00
flowzone-app[bot]
c88cf6a259
v16.12.5 2025-03-04 13:35:28 +00:00
Page-
906ce6dc0d
Merge pull request #2405 from balena-os/fix-api-request-timeout
Decrease balenaCloud api request timeout from 15m to 59s
2025-03-04 13:34:35 +00:00
Pagan Gazzard
49163e92a0 Decrease balenaCloud api request timeout from 15m to 59s
This was mistakenly increased due to confusion between the timeout for
requests to the supervisor's api vs the timeout for requests from the
supervisor to the balenaCloud api. This separates the two configs and
documents the difference between the timeouts whilst also decreasing
the timeout for balenaCloud api requests to the correct/expected value

Change-type: patch
2025-03-04 12:29:18 +00:00
flowzone-app[bot]
f67e45f432
v16.12.4 2025-03-03 13:42:20 +00:00
flowzone-app[bot]
91335051ac
Merge pull request #2403 from balena-os/dont-revert-to-regular-pull-if-401
Don't revert to regular pull if delta server 401
2025-03-03 13:41:29 +00:00
Christina Ying Wang
2dc9d275b1 Don't revert to regular pull if delta server 401
If the Supervisor receives a 401 Unauthorized from the delta server
when requesting a delta image location, we should surface the error
instead of falling back to a regular pull immediately, as there could
be an issue with the delta auth token, which refreshes after
DELTA_TOKEN_TIMEOUT (10min), or some other edge case.

Change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
2025-02-24 16:17:15 -08:00
flowzone-app[bot]
b6f0ecba18
v16.12.3 2025-02-19 20:51:55 +00:00
flowzone-app[bot]
dd0253ff1f
Merge pull request #2396 from balena-os/switch-to-image-pull-if-delta-failure
Switch to image pull if delta failure
2025-02-19 20:50:58 +00:00
Christina Ying Wang
5936af37e7 Bump docker-progress to 5.2.4
Signed-off-by: Christina Ying Wang <christina@balena.io>
2025-02-12 13:49:09 -08:00
Christina Ying Wang
341111f1f9 Retry DELTA_APPLY_RETRY_COUNT (3) times during delta apply fail before reverting to regular pull
This prevents an image download error loop where the delta image on the delta server is present,
but some aspect of the delta image or the base image on the device does not match up, causing
the delta to fail to be applied to the base image.

Delta apply errors don't raise status codes as they are thrown from the Engine (although they should),
so if an error with a status code is raised during this time, throw an error to the handler
indicating that the delta should be retried until success. Errors with status codes raised during
this time are largely network related, so falling back to a regular pull won't improve anything.

Upon delta apply errors exceeding DELTA_APPLY_RETRY_COUNT, revert to a regular pull.

Change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
2025-02-11 12:19:53 -08:00
Christina Ying Wang
1fc242200f Revert to regular pull immediately on delta server failure (code 400s)
If the delta server responds immediately with HTTP 4xx upon requesting a delta image,
this means the server is not able to supply the resource, so fall back to a regular pull
immediately.

Change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
2025-02-11 10:58:51 -08:00
flowzone-app[bot]
5c94c61b0a
v16.12.2 2025-02-11 01:04:24 +00:00
balena-renovate[bot]
43426a4a26
Merge pull request #2401 from balena-os/renovate/balena-io-deploy-to-balena-action-2.0.x
Update balena-io/deploy-to-balena-action action to v2.0.92
2025-02-11 01:03:38 +00:00
balena-renovate[bot]
c57622e226
Update balena-io/deploy-to-balena-action action to v2.0.92
Update balena-io/deploy-to-balena-action from 2.0.74 to 2.0.92

Change-type: patch
2025-02-11 00:32:00 +00:00
22 changed files with 418 additions and 71 deletions

View File

@ -13,7 +13,7 @@ inputs:
runs: runs:
using: 'composite' using: 'composite'
steps: steps:
- uses: balena-io/deploy-to-balena-action@72b7652cd8b4b0b49376f60fe790eef9ba76e3f0 # v2.0.74 - uses: balena-io/deploy-to-balena-action@3cb4217ab3347a885b4fcdc44d5f3a4153145633 # v2.0.92
with: with:
balena_token: ${{ fromJSON(inputs.secrets).BALENA_STAGING_TOKEN }} balena_token: ${{ fromJSON(inputs.secrets).BALENA_STAGING_TOKEN }}
fleet: ${{ env.matrix_value }} fleet: ${{ env.matrix_value }}

View File

@ -13,7 +13,7 @@ inputs:
runs: runs:
using: "composite" using: "composite"
steps: steps:
- uses: balena-io/deploy-to-balena-action@72b7652cd8b4b0b49376f60fe790eef9ba76e3f0 # v2.0.74 - uses: balena-io/deploy-to-balena-action@3cb4217ab3347a885b4fcdc44d5f3a4153145633 # v2.0.92
with: with:
balena_token: ${{ fromJSON(inputs.secrets).BALENA_STAGING_TOKEN }} balena_token: ${{ fromJSON(inputs.secrets).BALENA_STAGING_TOKEN }}
fleet: ${{ env.matrix_value }} fleet: ${{ env.matrix_value }}

View File

@ -1,3 +1,179 @@
- commits:
- subject: Remove GOT retries on state poll
hash: ae337a1dd7743b0ee0a05c32a5ce01965c5bafef
body: |
The state poll already has its own retry implementation, making GOT's default
retries unnecessary.
footer:
Change-type: patch
change-type: patch
author: Felipe Lalanne
nested: []
- subject: Ensure poll socket timeout is defined early
hash: bdbc6a4ba4766f9466891497bc02bd33aff1d4c7
body: |
We have observed that even when setting the socket timeout on the
state poll https request, the timeout is only applied once the socket is
connected. This causes issues with Node's auto family selection (happy
eyeballs), as the default https timeout is 5s, which means that a larger
[auto select attempt timeout](https://nodejs.org/docs/latest-v22.x/api/net.html#netgetdefaultautoselectfamilyattempttimeout) may result in the socket timing out before all connection attempts have been tried.
This commit sets a different https Agent for state polling, with a
timeout matching the `apiRequestTimeout` used for other request events.
footer:
Change-type: patch
change-type: patch
author: Felipe Lalanne
nested: []
version: 16.12.8
title: ""
date: 2025-03-12T14:50:33.204Z
- commits:
- subject: Release locks when removing apps
hash: 026dc0aed29ce7d66cfdd8616d80d1f5daf3ad46
body: |
This prevents leftover locks that can prevent other operations from
taking place.
footer:
Change-type: patch
change-type: patch
author: Felipe Lalanne
nested: []
version: 16.12.7
title: ""
date: 2025-03-06T19:11:18.704Z
- commits:
- subject: Log non-API errors during state poll
hash: 6d00be20930398699da1006176dac1e81b2dbbd6
body: >
The supervisor was failing silently if an error happened while
establishing the
connection (e.g. requesting the socket).
footer:
Change-type: patch
change-type: patch
author: Felipe Lalanne
nested: []
- subject: Fix target poll healthcheck
hash: f8bdb1433508dcaeff12a78d746256041ba1c414
body: |
The Target.lastFetch time compared when performing the healthcheck
resets any time a poll is attempted no matter the outcome. This changes
the behavior so the time is reset only on a successful poll
footer:
Change-type: patch
change-type: patch
author: Felipe Lalanne
nested: []
version: 16.12.6
title: ""
date: 2025-03-04T14:25:06.565Z
- commits:
- subject: Decrease balenaCloud api request timeout from 15m to 59s
hash: 49163e92a013250f72ca7231e11945b465c4dd45
body: |
This was mistakenly increased due to confusion between the timeout for
requests to the supervisor's api vs the timeout for requests from the
supervisor to the balenaCloud api. This separates the two configs and
documents the difference between the timeouts whilst also decreasing
the timeout for balenaCloud api requests to the correct/expected value
footer:
Change-type: patch
change-type: patch
author: Pagan Gazzard
nested: []
version: 16.12.5
title: ""
date: 2025-03-04T13:35:26.801Z
- commits:
- subject: Don't revert to regular pull if delta server 401
hash: 2dc9d275b15a0802264bcd49e2f0dddbbadd2225
body: |
If the Supervisor receives a 401 Unauthorized from the delta server
when requesting a delta image location, we should surface the error
instead of falling back to a regular pull immediately, as there could
be an issue with the delta auth token, which refreshes after
DELTA_TOKEN_TIMEOUT (10min), or some other edge case.
footer:
Change-type: patch
change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
signed-off-by: Christina Ying Wang <christina@balena.io>
author: Christina Ying Wang
nested: []
version: 16.12.4
title: ""
date: 2025-03-03T13:42:18.045Z
- commits:
- subject: Retry DELTA_APPLY_RETRY_COUNT (3) times during delta apply fail before
reverting to regular pull
hash: 341111f1f94cd9f17fd7be9b6f21e3bc22c9ad3a
body: >
This prevents an image download error loop where the delta image on the
delta server is present,
but some aspect of the delta image or the base image on the device does
not match up, causing
the delta to fail to be applied to the base image.
Delta apply errors don't raise status codes as they are thrown from the
Engine (although they should),
so if an error with a status code is raised during this time, throw an
error to the handler
indicating that the delta should be retried until success. Errors with
status codes raised during
this time are largely network related, so falling back to a regular pull
won't improve anything.
Upon delta apply errors exceeding DELTA_APPLY_RETRY_COUNT, revert to a
regular pull.
footer:
Change-type: patch
change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
signed-off-by: Christina Ying Wang <christina@balena.io>
author: Christina Ying Wang
nested: []
- subject: Revert to regular pull immediately on delta server failure (code 400s)
hash: 1fc242200f78e4219aafc5bb91de8cf0916236af
body: >
If the delta server responds immediately with HTTP 4xx upon requesting a
delta image,
this means the server is not able to supply the resource, so fall back
to a regular pull
immediately.
footer:
Change-type: patch
change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
signed-off-by: Christina Ying Wang <christina@balena.io>
author: Christina Ying Wang
nested: []
version: 16.12.3
title: ""
date: 2025-02-19T20:51:53.085Z
- commits:
- subject: Update balena-io/deploy-to-balena-action action to v2.0.92
hash: c57622e2264e41078e907d6ba8de9d5206bb6293
body: |
Update balena-io/deploy-to-balena-action from 2.0.74 to 2.0.92
footer:
Change-type: patch
change-type: patch
author: balena-renovate[bot]
nested: []
version: 16.12.2
title: ""
date: 2025-02-11T01:04:22.736Z
- commits: - commits:
- subject: Pin io-ts version to v2.2.20 - subject: Pin io-ts version to v2.2.20
hash: 88e821ed8e36e10d6429dc31950b5aeed968aa3f hash: 88e821ed8e36e10d6429dc31950b5aeed968aa3f

View File

@ -4,6 +4,44 @@ All notable changes to this project will be documented in this file
automatically by Versionist. DO NOT EDIT THIS FILE MANUALLY! automatically by Versionist. DO NOT EDIT THIS FILE MANUALLY!
This project adheres to [Semantic Versioning](http://semver.org/). This project adheres to [Semantic Versioning](http://semver.org/).
# v16.12.8
## (2025-03-12)
* Remove GOT retries on state poll [Felipe Lalanne]
* Ensure poll socket timeout is defined early [Felipe Lalanne]
# v16.12.7
## (2025-03-06)
* Release locks when removing apps [Felipe Lalanne]
# v16.12.6
## (2025-03-04)
* Log non-API errors during state poll [Felipe Lalanne]
* Fix target poll healthcheck [Felipe Lalanne]
# v16.12.5
## (2025-03-04)
* Decrease balenaCloud api request timeout from 15m to 59s [Pagan Gazzard]
# v16.12.4
## (2025-03-03)
* Don't revert to regular pull if delta server 401 [Christina Ying Wang]
# v16.12.3
## (2025-02-19)
* Retry DELTA_APPLY_RETRY_COUNT (3) times during delta apply fail before reverting to regular pull [Christina Ying Wang]
* Revert to regular pull immediately on delta server failure (code 400s) [Christina Ying Wang]
# v16.12.2
## (2025-02-11)
* Update balena-io/deploy-to-balena-action action to v2.0.92 [balena-renovate[bot]]
# v16.12.1 # v16.12.1
## (2025-02-10) ## (2025-02-10)

View File

@ -1 +1 @@
16.12.1 16.12.8

View File

@ -2,6 +2,6 @@ name: balena-supervisor
description: 'Balena Supervisor: balena''s agent on devices.' description: 'Balena Supervisor: balena''s agent on devices.'
joinable: false joinable: false
type: sw.application type: sw.application
version: 16.12.1 version: 16.12.8
provides: provides:
- slug: sw.compose.long-volume-syntax - slug: sw.compose.long-volume-syntax

40
package-lock.json generated
View File

@ -1,12 +1,12 @@
{ {
"name": "balena-supervisor", "name": "balena-supervisor",
"version": "16.12.1", "version": "16.12.8",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "balena-supervisor", "name": "balena-supervisor",
"version": "16.12.1", "version": "16.12.8",
"license": "Apache-2.0", "license": "Apache-2.0",
"dependencies": { "dependencies": {
"@balena/systemd": "^0.5.0", "@balena/systemd": "^0.5.0",
@ -58,13 +58,13 @@
"copy-webpack-plugin": "^12.0.0", "copy-webpack-plugin": "^12.0.0",
"deep-object-diff": "1.1.0", "deep-object-diff": "1.1.0",
"docker-delta": "^4.1.0", "docker-delta": "^4.1.0",
"docker-progress": "^5.2.3", "docker-progress": "^5.2.4",
"dockerode": "^4.0.2", "dockerode": "^4.0.2",
"duration-js": "^4.0.0", "duration-js": "^4.0.0",
"express": "^4.21.2", "express": "^4.21.2",
"fork-ts-checker-webpack-plugin": "^9.0.2", "fork-ts-checker-webpack-plugin": "^9.0.2",
"fp-ts": "^2.16.5", "fp-ts": "^2.16.5",
"got": "14.4.1", "got": "^14.4.6",
"husky": "^9.1.7", "husky": "^9.1.7",
"io-ts": "2.2.20", "io-ts": "2.2.20",
"io-ts-reporters": "^2.0.1", "io-ts-reporters": "^2.0.1",
@ -1225,13 +1225,13 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/@sindresorhus/is": { "node_modules/@sindresorhus/is": {
"version": "6.3.1", "version": "7.0.1",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-6.3.1.tgz", "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-7.0.1.tgz",
"integrity": "sha512-FX4MfcifwJyFOI2lPoX7PQxCqx8BG1HCho7WdiXwpEQx1Ycij0JxkfYtGK7yqNScrZGSlt6RE6sw8QYoH7eKnQ==", "integrity": "sha512-QWLl2P+rsCJeofkDNIT3WFmb6NrRud1SUYW8dIhXK/46XFV8Q/g7Bsvib0Askb0reRLe+WYPeeE+l5cH7SlkuQ==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=16" "node": ">=18"
}, },
"funding": { "funding": {
"url": "https://github.com/sindresorhus/is?sponsor=1" "url": "https://github.com/sindresorhus/is?sponsor=1"
@ -4794,10 +4794,11 @@
} }
}, },
"node_modules/docker-progress": { "node_modules/docker-progress": {
"version": "5.2.3", "version": "5.2.4",
"resolved": "https://registry.npmjs.org/docker-progress/-/docker-progress-5.2.3.tgz", "resolved": "https://registry.npmjs.org/docker-progress/-/docker-progress-5.2.4.tgz",
"integrity": "sha512-tsiqpC61pzaDOkKhbvr7ABQB2bL3bx+sVa7r4IZFf3tzwcMIhcU/sr5fqsXOKzIspxiCL+UHNS9gNO5ly9JxWg==", "integrity": "sha512-sgEXTJh78YOj8pIBIzZHLo3KpamJ5N0/3pU7DkpZBBvxZ9PmO0d9ND6x7TExQZf4hgvlFRBS41aN+GHx6vu5KQ==",
"dev": true, "dev": true,
"license": "Apache-2.0",
"dependencies": { "dependencies": {
"@types/dockerode": "^3.3.23", "@types/dockerode": "^3.3.23",
"JSONStream": "^1.3.5", "JSONStream": "^1.3.5",
@ -7053,24 +7054,23 @@
} }
}, },
"node_modules/got": { "node_modules/got": {
"version": "14.4.1", "version": "14.4.6",
"resolved": "https://registry.npmjs.org/got/-/got-14.4.1.tgz", "resolved": "https://registry.npmjs.org/got/-/got-14.4.6.tgz",
"integrity": "sha512-IvDJbJBUeexX74xNQuMIVgCRRuNOm5wuK+OC3Dc2pnSoh1AOmgc7JVj7WC+cJ4u0aPcO9KZ2frTXcqK4W/5qTQ==", "integrity": "sha512-rnhwfM/PhMNJ1i17k3DuDqgj0cKx3IHxBKVv/WX1uDKqrhi2Gv3l7rhPThR/Cc6uU++dD97W9c8Y0qyw9x0jag==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@sindresorhus/is": "^6.3.1", "@sindresorhus/is": "^7.0.1",
"@szmarczak/http-timer": "^5.0.1", "@szmarczak/http-timer": "^5.0.1",
"cacheable-lookup": "^7.0.0", "cacheable-lookup": "^7.0.0",
"cacheable-request": "^12.0.1", "cacheable-request": "^12.0.1",
"decompress-response": "^6.0.0", "decompress-response": "^6.0.0",
"form-data-encoder": "^4.0.2", "form-data-encoder": "^4.0.2",
"get-stream": "^8.0.1",
"http2-wrapper": "^2.2.1", "http2-wrapper": "^2.2.1",
"lowercase-keys": "^3.0.0", "lowercase-keys": "^3.0.0",
"p-cancelable": "^4.0.1", "p-cancelable": "^4.0.1",
"responselike": "^3.0.0", "responselike": "^3.0.0",
"type-fest": "^4.19.0" "type-fest": "^4.26.1"
}, },
"engines": { "engines": {
"node": ">=20" "node": ">=20"
@ -7109,9 +7109,9 @@
} }
}, },
"node_modules/got/node_modules/type-fest": { "node_modules/got/node_modules/type-fest": {
"version": "4.20.0", "version": "4.35.0",
"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-4.20.0.tgz", "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-4.35.0.tgz",
"integrity": "sha512-MBh+PHUHHisjXf4tlx0CFWoMdjx8zCMLJHOjnV1prABYZFHqtFOyauCIK2/7w4oIfwkF8iNhLtnJEfVY2vn3iw==", "integrity": "sha512-2/AwEFQDFEy30iOLjrvHDIH7e4HEWH+f1Yl1bI5XMqzuoCUqwYCdxachgsgv0og/JdVZUhbfjcJAoHj5L1753A==",
"dev": true, "dev": true,
"license": "(MIT OR CC0-1.0)", "license": "(MIT OR CC0-1.0)",
"engines": { "engines": {

View File

@ -1,7 +1,7 @@
{ {
"name": "balena-supervisor", "name": "balena-supervisor",
"description": "This is balena's Supervisor, a program that runs on IoT devices and has the task of running user Apps (which are Docker containers), and updating them as the balena API informs it to.", "description": "This is balena's Supervisor, a program that runs on IoT devices and has the task of running user Apps (which are Docker containers), and updating them as the balena API informs it to.",
"version": "16.12.1", "version": "16.12.8",
"license": "Apache-2.0", "license": "Apache-2.0",
"repository": { "repository": {
"type": "git", "type": "git",
@ -84,13 +84,13 @@
"copy-webpack-plugin": "^12.0.0", "copy-webpack-plugin": "^12.0.0",
"deep-object-diff": "1.1.0", "deep-object-diff": "1.1.0",
"docker-delta": "^4.1.0", "docker-delta": "^4.1.0",
"docker-progress": "^5.2.3", "docker-progress": "^5.2.4",
"dockerode": "^4.0.2", "dockerode": "^4.0.2",
"duration-js": "^4.0.0", "duration-js": "^4.0.0",
"express": "^4.21.2", "express": "^4.21.2",
"fork-ts-checker-webpack-plugin": "^9.0.2", "fork-ts-checker-webpack-plugin": "^9.0.2",
"fp-ts": "^2.16.5", "fp-ts": "^2.16.5",
"got": "14.4.1", "got": "^14.4.6",
"husky": "^9.1.7", "husky": "^9.1.7",
"io-ts": "2.2.20", "io-ts": "2.2.20",
"io-ts-reporters": "^2.0.1", "io-ts-reporters": "^2.0.1",
@ -137,6 +137,6 @@
"yargs": "^17.7.2" "yargs": "^17.7.2"
}, },
"versionist": { "versionist": {
"publishedAt": "2025-02-10T22:51:52.294Z" "publishedAt": "2025-03-12T14:50:33.763Z"
} }
} }

View File

@ -63,7 +63,7 @@ export async function healthcheck() {
} }
// Check last time target state has been polled // Check last time target state has been polled
const timeSinceLastFetch = process.hrtime(TargetState.lastFetch); const timeSinceLastFetch = process.hrtime(TargetState.lastSuccessfulFetch);
const timeSinceLastFetchMs = const timeSinceLastFetchMs =
timeSinceLastFetch[0] * 1000 + timeSinceLastFetch[1] / 1e6; timeSinceLastFetch[0] * 1000 + timeSinceLastFetch[1] / 1e6;

View File

@ -3,6 +3,7 @@ import url from 'url';
import { setTimeout } from 'timers/promises'; import { setTimeout } from 'timers/promises';
import Bluebird from 'bluebird'; import Bluebird from 'bluebird';
import type StrictEventEmitter from 'strict-event-emitter-types'; import type StrictEventEmitter from 'strict-event-emitter-types';
import { Agent } from 'https';
import type { TargetState } from '../types/state'; import type { TargetState } from '../types/state';
import { InternalInconsistencyError } from '../lib/errors'; import { InternalInconsistencyError } from '../lib/errors';
@ -87,7 +88,8 @@ const emitTargetState = (
* We set a value rather than being undeclared because having it undefined * We set a value rather than being undeclared because having it undefined
* adds more overhead to dealing with this value without any benefits. * adds more overhead to dealing with this value without any benefits.
*/ */
export let lastFetch: ReturnType<typeof process.hrtime> = process.hrtime(); export let lastSuccessfulFetch: ReturnType<typeof process.hrtime> =
process.hrtime();
/** /**
* Attempts to update the target state * Attempts to update the target state
@ -101,11 +103,11 @@ export const update = async (
): Promise<void> => { ): Promise<void> => {
await config.initialized(); await config.initialized();
return Bluebird.using(lockGetTarget(), async () => { return Bluebird.using(lockGetTarget(), async () => {
const { uuid, apiEndpoint, apiTimeout, deviceApiKey } = const { uuid, apiEndpoint, apiRequestTimeout, deviceApiKey } =
await config.getMany([ await config.getMany([
'uuid', 'uuid',
'apiEndpoint', 'apiEndpoint',
'apiTimeout', 'apiRequestTimeout',
'deviceApiKey', 'deviceApiKey',
]); ]);
@ -119,6 +121,13 @@ export const update = async (
const got = await getGotInstance(); const got = await getGotInstance();
const { statusCode, headers, body } = await got(endpoint, { const { statusCode, headers, body } = await got(endpoint, {
retry: { limit: 0 },
agent: {
https: new Agent({
keepAlive: true,
timeout: apiRequestTimeout,
}),
},
headers: { headers: {
Authorization: `Bearer ${deviceApiKey}`, Authorization: `Bearer ${deviceApiKey}`,
'If-None-Match': cache?.etag, 'If-None-Match': cache?.etag,
@ -126,12 +135,12 @@ export const update = async (
timeout: { timeout: {
// TODO: We use the same default timeout for all of these in order to have a timeout generally // TODO: We use the same default timeout for all of these in order to have a timeout generally
// but it would probably make sense to tune them individually // but it would probably make sense to tune them individually
lookup: apiTimeout, lookup: apiRequestTimeout,
connect: apiTimeout, connect: apiRequestTimeout,
secureConnect: apiTimeout, secureConnect: apiRequestTimeout,
socket: apiTimeout, socket: apiRequestTimeout,
send: apiTimeout, send: apiRequestTimeout,
response: apiTimeout, response: apiRequestTimeout,
}, },
}); });
@ -154,8 +163,6 @@ export const update = async (
// Emit the target state and update the cache // Emit the target state and update the cache
cache.emitted = emitTargetState(cache, force, isFromApi); cache.emitted = emitTargetState(cache, force, isFromApi);
}).finally(() => {
lastFetch = process.hrtime();
}); });
}; };
@ -188,7 +195,11 @@ const poll = async (
await update(); await update();
// Reset fetchErrors because we successfully updated // Reset fetchErrors because we successfully updated
fetchErrors = 0; fetchErrors = 0;
} catch { lastSuccessfulFetch = process.hrtime();
} catch (e) {
if (!(e instanceof ApiResponseError)) {
log.error('Target state poll failed', e);
}
// Exponential back off if request fails // Exponential back off if request fails
pollInterval = Math.min(appUpdatePollInterval, 15000 * 2 ** fetchErrors); pollInterval = Math.min(appUpdatePollInterval, 15000 * 2 ** fetchErrors);
++fetchErrors; ++fetchErrors;

View File

@ -41,14 +41,17 @@ export let stateReportErrors = 0;
type StateReportOpts = { type StateReportOpts = {
[key in keyof Pick< [key in keyof Pick<
config.ConfigMap<SchemaTypeKey>, config.ConfigMap<SchemaTypeKey>,
'apiEndpoint' | 'apiTimeout' | 'deviceApiKey' | 'appUpdatePollInterval' | 'apiEndpoint'
| 'apiRequestTimeout'
| 'deviceApiKey'
| 'appUpdatePollInterval'
>]: SchemaReturn<key>; >]: SchemaReturn<key>;
}; };
type StateReport = { body: Partial<DeviceState>; opts: StateReportOpts }; type StateReport = { body: Partial<DeviceState>; opts: StateReportOpts };
async function report({ body, opts }: StateReport) { async function report({ body, opts }: StateReport) {
const { apiEndpoint, apiTimeout, deviceApiKey } = opts; const { apiEndpoint, apiRequestTimeout, deviceApiKey } = opts;
if (!apiEndpoint) { if (!apiEndpoint) {
throw new InternalInconsistencyError( throw new InternalInconsistencyError(
@ -69,7 +72,7 @@ async function report({ body, opts }: StateReport) {
const [{ statusCode, body: statusMessage, headers }] = await request const [{ statusCode, body: statusMessage, headers }] = await request
.patchAsync(endpoint, params) .patchAsync(endpoint, params)
.timeout(apiTimeout); .timeout(apiRequestTimeout);
if (statusCode < 200 || statusCode >= 300) { if (statusCode < 200 || statusCode >= 300) {
throw new StatusError( throw new StatusError(
@ -203,7 +206,7 @@ export async function startReporting() {
// Get configs needed to make a report // Get configs needed to make a report
const reportConfigs = (await config.getMany([ const reportConfigs = (await config.getMany([
'apiEndpoint', 'apiEndpoint',
'apiTimeout', 'apiRequestTimeout',
'deviceApiKey', 'deviceApiKey',
'appUpdatePollInterval', 'appUpdatePollInterval',
])) as StateReportOpts; ])) as StateReportOpts;

View File

@ -247,6 +247,16 @@ class AppImpl implements App {
} }
} }
// Release locks (if any) for all services before settling state
if (state.lock || state.hasLeftoverLocks) {
return [
generateStep('releaseLock', {
appId: this.appId,
lock: state.lock,
}),
];
}
return []; return [];
} }

View File

@ -90,7 +90,7 @@ export const fnSchema = {
'deviceArch', 'deviceArch',
'deviceType', 'deviceType',
'apiEndpoint', 'apiEndpoint',
'apiTimeout', 'apiRequestTimeout',
'registered_at', 'registered_at',
'deviceId', 'deviceId',
'version', 'version',
@ -107,7 +107,7 @@ export const fnSchema = {
provisioningApiKey: conf.apiKey, provisioningApiKey: conf.apiKey,
deviceApiKey: conf.deviceApiKey, deviceApiKey: conf.deviceApiKey,
apiEndpoint: conf.apiEndpoint, apiEndpoint: conf.apiEndpoint,
apiTimeout: conf.apiTimeout, apiRequestTimeout: conf.apiRequestTimeout,
registered_at: conf.registered_at, registered_at: conf.registered_at,
deviceId: conf.deviceId, deviceId: conf.deviceId,
supervisorVersion: conf.version, supervisorVersion: conf.version,

View File

@ -12,6 +12,9 @@ export const schemaTypes = {
type: t.string, type: t.string,
default: '', default: '',
}, },
/**
* The timeout for the supervisor's api
*/
apiTimeout: { apiTimeout: {
type: PermissiveNumber, type: PermissiveNumber,
default: 15 * 60 * 1000, default: 15 * 60 * 1000,
@ -118,6 +121,13 @@ export const schemaTypes = {
type: PermissiveBoolean, type: PermissiveBoolean,
default: false, default: false,
}, },
/**
* The timeout for requests to the balenaCloud api
*/
apiRequestTimeout: {
type: PermissiveNumber,
default: 59000,
},
deltaRequestTimeout: { deltaRequestTimeout: {
type: PermissiveNumber, type: PermissiveNumber,
default: 59000, default: 59000,
@ -218,7 +228,7 @@ export const schemaTypes = {
provisioningApiKey: t.union([t.string, NullOrUndefined]), provisioningApiKey: t.union([t.string, NullOrUndefined]),
deviceApiKey: t.string, deviceApiKey: t.string,
apiEndpoint: t.string, apiEndpoint: t.string,
apiTimeout: PermissiveNumber, apiRequestTimeout: PermissiveNumber,
registered_at: t.union([PermissiveNumber, NullOrUndefined]), registered_at: t.union([PermissiveNumber, NullOrUndefined]),
deviceId: t.union([PermissiveNumber, NullOrUndefined]), deviceId: t.union([PermissiveNumber, NullOrUndefined]),
supervisorVersion: t.union([t.string, t.undefined]), supervisorVersion: t.union([t.string, t.undefined]),

View File

@ -4,6 +4,9 @@ export const schema = {
mutable: false, mutable: false,
removeIfNull: false, removeIfNull: false,
}, },
/**
* The timeout for the supervisor's api
*/
apiTimeout: { apiTimeout: {
source: 'config.json', source: 'config.json',
mutable: false, mutable: false,
@ -120,6 +123,11 @@ export const schema = {
mutable: true, mutable: true,
removeIfNull: false, removeIfNull: false,
}, },
apiRequestTimeout: {
source: 'db',
mutable: true,
removeIfNull: false,
},
delta: { delta: {
source: 'db', source: 'db',
mutable: true, mutable: true,

View File

@ -141,6 +141,11 @@ const configKeys: Dictionary<ConfigOption> = {
varType: 'bool', varType: 'bool',
defaultValue: 'true', defaultValue: 'true',
}, },
apiRequestTimeout: {
envVarName: 'SUPERVISOR_API_REQUEST_TIMEOUT',
varType: 'int',
defaultValue: '59000',
},
delta: { delta: {
envVarName: 'SUPERVISOR_DELTA', envVarName: 'SUPERVISOR_DELTA',
varType: 'bool', varType: 'bool',

View File

@ -111,10 +111,10 @@ export const exchangeKeyAndGetDevice = async (
opts: Partial<KeyExchangeOpts>, opts: Partial<KeyExchangeOpts>,
): Promise<Device> => { ): Promise<Device> => {
const uuid = opts.uuid; const uuid = opts.uuid;
const apiTimeout = opts.apiTimeout; const apiRequestTimeout = opts.apiRequestTimeout;
if (!(uuid && apiTimeout)) { if (!(uuid && apiRequestTimeout)) {
throw new InternalInconsistencyError( throw new InternalInconsistencyError(
'UUID and apiTimeout should be defined in exchangeKeyAndGetDevice', 'UUID and apiRequestTimeout should be defined in exchangeKeyAndGetDevice',
); );
} }
@ -122,7 +122,12 @@ export const exchangeKeyAndGetDevice = async (
// valid, because if it is then we can just use that // valid, because if it is then we can just use that
if (opts.deviceApiKey != null) { if (opts.deviceApiKey != null) {
try { try {
return await fetchDevice(balenaApi, uuid, opts.deviceApiKey, apiTimeout); return await fetchDevice(
balenaApi,
uuid,
opts.deviceApiKey,
apiRequestTimeout,
);
} catch (e) { } catch (e) {
if (e instanceof DeviceNotFoundError) { if (e instanceof DeviceNotFoundError) {
// do nothing... // do nothing...
@ -146,7 +151,7 @@ export const exchangeKeyAndGetDevice = async (
balenaApi, balenaApi,
uuid, uuid,
opts.provisioningApiKey, opts.provisioningApiKey,
apiTimeout, apiRequestTimeout,
); );
} catch { } catch {
throw new ExchangeKeyError(`Couldn't fetch device with provisioning key`); throw new ExchangeKeyError(`Couldn't fetch device with provisioning key`);
@ -165,7 +170,7 @@ export const exchangeKeyAndGetDevice = async (
Authorization: `Bearer ${opts.provisioningApiKey}`, Authorization: `Bearer ${opts.provisioningApiKey}`,
}, },
}) })
.timeout(apiTimeout); .timeout(apiRequestTimeout);
if (res.statusCode !== 200) { if (res.statusCode !== 200) {
throw new ExchangeKeyError( throw new ExchangeKeyError(
@ -220,7 +225,7 @@ export const provision = async (
osVariant: opts.osVariant, osVariant: opts.osVariant,
macAddress: opts.macAddress, macAddress: opts.macAddress,
}), }),
).timeout(opts.apiTimeout); ).timeout(opts.apiRequestTimeout);
} catch (err) { } catch (err) {
if ( if (
err instanceof deviceRegister.ApiError && err instanceof deviceRegister.ApiError &&

View File

@ -1,22 +1,23 @@
import type { ProgressCallback } from 'docker-progress';
import { DockerProgress } from 'docker-progress'; import { DockerProgress } from 'docker-progress';
import type { ProgressCallback } from 'docker-progress';
import Dockerode from 'dockerode'; import Dockerode from 'dockerode';
import _ from 'lodash'; import _ from 'lodash';
import memoizee from 'memoizee'; import memoizee from 'memoizee';
import { applyDelta, OutOfSyncError } from 'docker-delta'; import { applyDelta, OutOfSyncError } from 'docker-delta';
import type { SchemaReturn } from '../config/schema-type'; import log from './supervisor-console';
import { envArrayToObject } from './conversions'; import { envArrayToObject } from './conversions';
import * as request from './request';
import { import {
DeltaStillProcessingError, DeltaStillProcessingError,
ImageAuthenticationError, ImageAuthenticationError,
InvalidNetGatewayError, InvalidNetGatewayError,
DeltaServerError,
DeltaApplyError,
isStatusError,
} from './errors'; } from './errors';
import * as request from './request';
import type { EnvVarObject } from '../types'; import type { EnvVarObject } from '../types';
import type { SchemaReturn } from '../config/schema-type';
import log from './supervisor-console';
export type FetchOptions = SchemaReturn<'fetchOptions'>; export type FetchOptions = SchemaReturn<'fetchOptions'>;
export type DeltaFetchOptions = FetchOptions & { export type DeltaFetchOptions = FetchOptions & {
@ -41,6 +42,18 @@ type ImageNameParts = {
// (10 mins) // (10 mins)
const DELTA_TOKEN_TIMEOUT = 10 * 60 * 1000; const DELTA_TOKEN_TIMEOUT = 10 * 60 * 1000;
// How many times to retry a v3 delta apply before falling back to a regular pull.
// A delta is applied to the base image when pulling, so a failure could be due to
// "layers from manifest don't match image configuration", which can occur before
// or after downloading delta image layers.
//
// Other causes of failure are not documented as clearly as the "layers from manifest"
// error, but they may occur as well; it is unclear whether they happen before, during,
// or after downloading delta image layers.
//
// See: https://github.com/balena-os/balena-engine/blob/master/distribution/pull_v2.go#L43
const DELTA_APPLY_RETRY_COUNT = 3;
export const docker = new Dockerode(); export const docker = new Dockerode();
export const dockerProgress = new DockerProgress({ export const dockerProgress = new DockerProgress({
docker, docker,
@ -113,11 +126,7 @@ export async function fetchDeltaWithProgress(
onProgress: ProgressCallback, onProgress: ProgressCallback,
serviceName: string, serviceName: string,
): Promise<string> { ): Promise<string> {
const deltaSourceId = const deltaSourceId = deltaOpts.deltaSourceId ?? deltaOpts.deltaSource;
deltaOpts.deltaSourceId != null
? deltaOpts.deltaSourceId
: deltaOpts.deltaSource;
const timeout = deltaOpts.deltaApplyTimeout; const timeout = deltaOpts.deltaApplyTimeout;
const logFn = (str: string) => const logFn = (str: string) =>
@ -143,7 +152,7 @@ export async function fetchDeltaWithProgress(
} }
// Since the supervisor never calls this function with a source anymore, // Since the supervisor never calls this function with a source anymore,
// this should never happen, but w ehandle it anyway // this should never happen, but we handle it anyway
if (deltaOpts.deltaSource == null) { if (deltaOpts.deltaSource == null) {
logFn('Falling back to regular pull due to lack of a delta source'); logFn('Falling back to regular pull due to lack of a delta source');
return fetchImageWithProgress(imgDest, deltaOpts, onProgress); return fetchImageWithProgress(imgDest, deltaOpts, onProgress);
@ -210,6 +219,18 @@ export async function fetchDeltaWithProgress(
} }
break; break;
case 3: case 3:
// If 400s status code, throw a more specific error & revert immediately to a regular pull,
// unless the code is 401 Unauthorized, in which case we should surface the error by retrying
// the delta server request, instead of falling back to a regular pull immediately.
if (res.statusCode >= 400 && res.statusCode < 500) {
if (res.statusCode === 401) {
throw new Error(
`Got ${res.statusCode} when requesting an image from delta server: ${res.statusMessage}`,
);
} else {
throw new DeltaServerError(res.statusCode, res.statusMessage);
}
}
if (res.statusCode !== 200) { if (res.statusCode !== 200) {
throw new Error( throw new Error(
`Got ${res.statusCode} when requesting v3 delta from delta server.`, `Got ${res.statusCode} when requesting v3 delta from delta server.`,
@ -225,24 +246,62 @@ export async function fetchDeltaWithProgress(
`Got an error when parsing delta server response for v3 delta: ${e}`, `Got an error when parsing delta server response for v3 delta: ${e}`,
); );
} }
id = await applyBalenaDelta(name, token, onProgress, logFn); // Try to apply delta DELTA_APPLY_RETRY_COUNT times, then throw DeltaApplyError
// Attempt the delta apply up to DELTA_APPLY_RETRY_COUNT times. `lastError` holds the
// failure of the most recent attempt and is cleared as soon as an attempt succeeds,
// so a DeltaApplyError is only raised when every attempt has failed.
let lastError: Error | undefined = undefined;
for (let tryCount = 0; tryCount < DELTA_APPLY_RETRY_COUNT; tryCount++) {
	try {
		id = await applyBalenaDelta(name, token, onProgress, logFn);
		// Success: discard any error recorded by an earlier failed attempt, otherwise
		// the check after the loop would throw even though the delta was applied.
		lastError = undefined;
		break;
	} catch (e) {
		if (isStatusError(e)) {
			// A status error during delta pull indicates network issues,
			// so we should throw an error to the handler that indicates that
			// the delta pull should be retried until network issues are resolved,
			// rather than falling back to a regular pull.
			throw e;
		}
		// Thrown values are not guaranteed to be Error instances; normalize so that
		// `message` is always available (and the value is always truthy) below.
		lastError = e instanceof Error ? e : new Error(String(e));
		logFn(
			`Delta apply failed, retrying (${tryCount + 1}/${DELTA_APPLY_RETRY_COUNT})...`,
		);
	}
}
// Only reached with an error set when all attempts failed.
if (lastError) {
	throw new DeltaApplyError(lastError.message);
}
} }
break; break;
default: default:
throw new Error(`Unsupported delta version: ${deltaOpts.deltaVersion}`); throw new Error(`Unsupported delta version: ${deltaOpts.deltaVersion}`);
} }
} catch (e) { } catch (e) {
// Log appropriate message based on error type
if (e instanceof OutOfSyncError) { if (e instanceof OutOfSyncError) {
logFn('Falling back to regular pull due to delta out of sync error'); logFn('Falling back to regular pull due to delta out of sync error');
return await fetchImageWithProgress(imgDest, deltaOpts, onProgress); } else if (e instanceof DeltaServerError) {
logFn(
`Falling back to regular pull due to delta server error (${e.statusCode})${e.statusMessage ? `: ${e.statusMessage}` : ''}`,
);
} else if (e instanceof DeltaApplyError) {
// A delta apply error is raised from the Engine and doesn't have a status code
logFn(
`Falling back to regular pull due to delta apply error ${e.message ? `: ${e.message}` : ''}`,
);
} else { } else {
logFn(`Delta failed with ${e}`); logFn(`Delta failed with ${e}`);
throw e; throw e;
} }
// For handled errors, fall back to regular pull
return fetchImageWithProgress(imgDest, deltaOpts, onProgress);
} }
logFn(`Delta applied successfully`); logFn(`Delta applied successfully`);
return id; return id!;
} }
export async function fetchImageWithProgress( export async function fetchImageWithProgress(

View File

@ -70,6 +70,13 @@ export class InvalidNetGatewayError extends TypedError {}
export class DeltaStillProcessingError extends TypedError {} export class DeltaStillProcessingError extends TypedError {}
/**
 * Status error raised for a v3 delta request when the delta server answers
 * with a 4xx status code other than 401. It is constructed with the response
 * status code and status message, and the delta fetch logic catches it in
 * order to fall back to a regular image pull.
 */
export class DeltaServerError extends StatusError {}
/**
 * Error raised when applying a delta fails on every attempt.
 *
 * Unlike status errors, it carries no status code: it only wraps the message
 * of the last failure reported while applying the delta.
 *
 * @param message - description of the underlying apply failure, if any
 */
export class DeltaApplyError extends Error {
	constructor(message?: string) {
		super(message);
		// Without this, `name` is inherited as 'Error', which makes this error
		// indistinguishable from a generic one in logs and stack traces.
		this.name = 'DeltaApplyError';
	}
}
export class UpdatesLockedError extends TypedError {} export class UpdatesLockedError extends TypedError {}
export function isHttpConflictError(err: { statusCode: number }): boolean { export function isHttpConflictError(err: { statusCode: number }): boolean {

View File

@ -84,6 +84,7 @@ describe('device-config', () => {
SUPERVISOR_LOCAL_MODE: 'false', SUPERVISOR_LOCAL_MODE: 'false',
SUPERVISOR_CONNECTIVITY_CHECK: 'true', SUPERVISOR_CONNECTIVITY_CHECK: 'true',
SUPERVISOR_LOG_CONTROL: 'true', SUPERVISOR_LOG_CONTROL: 'true',
SUPERVISOR_API_REQUEST_TIMEOUT: '59000',
SUPERVISOR_DELTA: 'false', SUPERVISOR_DELTA: 'false',
SUPERVISOR_DELTA_REQUEST_TIMEOUT: '59000', SUPERVISOR_DELTA_REQUEST_TIMEOUT: '59000',
SUPERVISOR_DELTA_APPLY_TIMEOUT: '0', SUPERVISOR_DELTA_APPLY_TIMEOUT: '0',

View File

@ -335,7 +335,7 @@ describe('ApiBinder', () => {
before(async () => { before(async () => {
await initModels(components, '/config-apibinder.json'); await initModels(components, '/config-apibinder.json');
previousLastFetch = TargetState.lastFetch; previousLastFetch = TargetState.lastSuccessfulFetch;
}); });
after(async () => { after(async () => {

View File

@ -2399,5 +2399,19 @@ describe('compose/app', () => {
const [releaseLockStep] = expectSteps('releaseLock', steps, 1); const [releaseLockStep] = expectSteps('releaseLock', steps, 1);
expect(releaseLockStep).to.have.property('appId').that.equals(1); expect(releaseLockStep).to.have.property('appId').that.equals(1);
}); });
it('should infer a releaseLock step when removing an app', async () => {
const current = createApp({
services: [],
networks: [],
});
const steps = current.stepsToRemoveApp({
...defaultContext,
lock: mockLock,
});
const [releaseLockStep] = expectSteps('releaseLock', steps, 1);
expect(releaseLockStep).to.have.property('appId').that.equals(1);
});
}); });
}); });