diff --git a/.changeset/calm-geese-obey.md b/.changeset/calm-geese-obey.md new file mode 100644 index 0000000000..b45295f8ff --- /dev/null +++ b/.changeset/calm-geese-obey.md @@ -0,0 +1,7 @@ +--- +'@backstage/plugin-kubernetes': minor +--- + +Refactor Kubernetes error detection to make way for proposed solutions + +**BREAKING**: `DetectedError` now appears once per Kubernetes resource per error instead of once for all resources which have that error. The `namespace` and `name` fields are now in the `sourceRef` object, and `message` is now a `string` instead of a `string[]`. `ErrorDetectableKind` has been removed. diff --git a/plugins/kubernetes/api-report.md b/plugins/kubernetes/api-report.md index 2cc2674a9e..ad1217e257 100644 --- a/plugins/kubernetes/api-report.md +++ b/plugins/kubernetes/api-report.md @@ -104,17 +104,21 @@ export interface DeploymentResources { // @public export interface DetectedError { // (undocumented) - cluster: string; + message: string; // (undocumented) - kind: ErrorDetectableKind; + occuranceCount: number; + // Warning: (ae-forgotten-export) The symbol "ProposedFix" needs to be exported by the entry point index.d.ts + // // (undocumented) - message: string[]; - // (undocumented) - names: string[]; - // (undocumented) - namespace: string; + proposedFix: ProposedFix[]; // (undocumented) severity: ErrorSeverity; + // Warning: (ae-forgotten-export) The symbol "ResourceRef" needs to be exported by the entry point index.d.ts + // + // (undocumented) + sourceRef: ResourceRef; + // (undocumented) + type: string; } // @public @@ -137,12 +141,6 @@ export type EntityKubernetesContentProps = { refreshIntervalMs?: number; }; -// @public -export type ErrorDetectableKind = - | 'Pod' - | 'Deployment' - | 'HorizontalPodAutoscaler'; - // Warning: (ae-forgotten-export) The symbol "ErrorPanelProps" needs to be exported by the entry point index.d.ts // Warning: (ae-missing-release-tag) "ErrorPanel" is part of the package's API, but it is missing a release tag (@alpha, @beta, 
@public, or @internal) // diff --git a/plugins/kubernetes/package.json b/plugins/kubernetes/package.json index 7b3b3d7b25..edff7348a4 100644 --- a/plugins/kubernetes/package.json +++ b/plugins/kubernetes/package.json @@ -47,6 +47,7 @@ "@types/react": "^16.13.1 || ^17.0.0", "cronstrue": "^2.2.0", "js-yaml": "^4.0.0", + "kubernetes-models": "^4.1.0", "lodash": "^4.17.21", "luxon": "^3.0.0", "react-use": "^17.2.4" diff --git a/plugins/kubernetes/src/components/ErrorReporting/ErrorReporting.tsx b/plugins/kubernetes/src/components/ErrorReporting/ErrorReporting.tsx index a99fc21902..367e791067 100644 --- a/plugins/kubernetes/src/components/ErrorReporting/ErrorReporting.tsx +++ b/plugins/kubernetes/src/components/ErrorReporting/ErrorReporting.tsx @@ -15,84 +15,64 @@ */ import * as React from 'react'; import { DetectedError, DetectedErrorsByCluster } from '../../error-detection'; -import { Chip } from '@material-ui/core'; import { Table, TableColumn } from '@backstage/core-components'; type ErrorReportingProps = { detectedErrors: DetectedErrorsByCluster; }; -const columns: TableColumn[] = [ +const columns: TableColumn[] = [ { title: 'cluster', width: '10%', - render: (detectedError: DetectedError) => detectedError.cluster, + render: (row: Row) => row.clusterName, }, { title: 'namespace', width: '10%', - render: (detectedError: DetectedError) => detectedError.namespace, + render: (row: Row) => row.error.sourceRef.namespace, }, { title: 'kind', width: '10%', - render: (detectedError: DetectedError) => detectedError.kind, + render: (row: Row) => row.error.sourceRef.kind, }, { title: 'name', width: '30%', - render: (detectedError: DetectedError) => { - const errorCount = detectedError.names.length; - - if (errorCount === 0) { - // This shouldn't happen - return null; - } - - const displayName = detectedError.names[0]; - - const otherErrorCount = errorCount - 1; - - return ( - <> - {displayName}{' '} - {otherErrorCount > 0 && ( - 1 ? 
's' : '' - }`} - size="small" - /> - )} - - ); + render: (row: Row) => { + return <>{row.error.sourceRef.name} ; }, }, { title: 'messages', width: '40%', - render: (detectedError: DetectedError) => ( - <> - {detectedError.message.map((m, i) => ( -
{m}
- ))} - - ), + render: (row: Row) => row.error.message, }, ]; -const sortBySeverity = (a: DetectedError, b: DetectedError) => { - if (a.severity < b.severity) { +interface Row { + clusterName: string; + error: DetectedError; +} + +const sortBySeverity = (a: Row, b: Row) => { + if (a.error.severity < b.error.severity) { return 1; - } else if (b.severity < a.severity) { + } else if (b.error.severity < a.error.severity) { return -1; } return 0; }; export const ErrorReporting = ({ detectedErrors }: ErrorReportingProps) => { - const errors = Array.from(detectedErrors.values()) - .flat() + const errors = Array.from(detectedErrors.entries()) + .flatMap(([clusterName, resourceErrors]) => { + return resourceErrors.map(e => ({ + clusterName, + error: e, + })); + }) .sort(sortBySeverity); return ( diff --git a/plugins/kubernetes/src/components/KubernetesContent.tsx b/plugins/kubernetes/src/components/KubernetesContent.tsx index a42e9fbb0b..63aa52f1ba 100644 --- a/plugins/kubernetes/src/components/KubernetesContent.tsx +++ b/plugins/kubernetes/src/components/KubernetesContent.tsx @@ -114,9 +114,8 @@ export const KubernetesContent = ({ const podsWithErrors = new Set( detectedErrors .get(item.cluster.name) - ?.filter(de => de.kind === 'Pod') - .map(de => de.names) - .flat() ?? 
[], + ?.filter(de => de.sourceRef.kind === 'Pod') + .map(de => de.sourceRef.name), ); return ( diff --git a/plugins/kubernetes/src/error-detection/__fixtures__/pod-crashing.json b/plugins/kubernetes/src/error-detection/__fixtures__/pod-crashing.json index 79acd2900d..bff3347813 100644 --- a/plugins/kubernetes/src/error-detection/__fixtures__/pod-crashing.json +++ b/plugins/kubernetes/src/error-detection/__fixtures__/pod-crashing.json @@ -185,7 +185,7 @@ }, "name": "other-side-car", "ready": false, - "restartCount": 38, + "restartCount": 123, "started": false, "state": { "waiting": { diff --git a/plugins/kubernetes/src/error-detection/common.ts b/plugins/kubernetes/src/error-detection/common.ts index bc5fd2fc40..381f57b71a 100644 --- a/plugins/kubernetes/src/error-detection/common.ts +++ b/plugins/kubernetes/src/error-detection/common.ts @@ -14,59 +14,15 @@ * limitations under the License. */ -import { - DetectedError, - ErrorDetectable, - ErrorDetectableKind, - ErrorMapper, -} from './types'; +import { DetectedError, ErrorMapper } from './types'; // Run through the each error mapper for each object // returning a deduplicated (mostly) result -export const detectErrorsInObjects = ( +export const detectErrorsInObjects = ( objects: T[], - kind: ErrorDetectableKind, - clusterName: string, errorMappers: ErrorMapper[], ): DetectedError[] => { - // Build up a map of errors - // key: the joined message produced by an error - // value: the error - const errors = new Map(); - - for (const object of objects) { - for (const errorMapper of errorMappers) { - if (errorMapper.errorExists(object)) { - const message = errorMapper.messageAccessor(object); - - // TODO This is not perfect as errors with uuid/hashes/date/times will not be caught by this - const dedupKey = message.join(''); - - const value = errors.get(dedupKey); - - const name = object.metadata?.name ?? 'unknown'; - const namespace = object.metadata?.namespace ?? 
'unknown'; - - if (value !== undefined) { - // This gets translated into the Chip "+5 others" - // in the ErrorReporting component - // but we need to keep the names so we can easily - // find which objects owns the error later - value.names.push(name); - errors.set(dedupKey, value); - } else { - errors.set(dedupKey, { - cluster: clusterName, - kind: kind, - names: [name], - message: message, - severity: errorMapper.severity, - namespace, - }); - } - } - } - } - - return Array.from(errors.values()); + return objects.flatMap(o => { + return errorMappers.flatMap(em => em.detectErrors(o)); + }); }; diff --git a/plugins/kubernetes/src/error-detection/deployments.ts b/plugins/kubernetes/src/error-detection/deployments.ts index 896cd231ff..7807110224 100644 --- a/plugins/kubernetes/src/error-detection/deployments.ts +++ b/plugins/kubernetes/src/error-detection/deployments.ts @@ -15,35 +15,33 @@ */ import { DetectedError, ErrorMapper } from './types'; -import { V1Deployment } from '@kubernetes/client-node'; +import { Deployment } from 'kubernetes-models/apps/v1'; import { detectErrorsInObjects } from './common'; -const deploymentErrorMappers: ErrorMapper[] = [ +const deploymentErrorMappers: ErrorMapper[] = [ { - // this is probably important - severity: 6, - errorExplanation: 'condition-message-present', - errorExists: deployment => { - return (deployment.status?.conditions ?? []) - .filter(c => c.status === 'False') - .some(c => c.message !== undefined); - }, - messageAccessor: deployment => { + detectErrors: deployment => { return (deployment.status?.conditions ?? []) .filter(c => c.status === 'False') .filter(c => c.message !== undefined) - .map(c => c.message ?? ''); + .map(c => ({ + type: 'condition-message-present', + message: c.message ?? '', + severity: 6, + proposedFix: [], // TODO next PR + sourceRef: { + name: deployment.metadata?.name ?? 'unknown hpa', + namespace: deployment.metadata?.namespace ?? 
'unknown namespace', + kind: 'Deployment', + apiGroup: 'apps/v1', + }, + occuranceCount: 1, + })); }, }, ]; export const detectErrorsInDeployments = ( - deployments: V1Deployment[], - clusterName: string, + deployments: Deployment[], ): DetectedError[] => - detectErrorsInObjects( - deployments, - 'Deployment', - clusterName, - deploymentErrorMappers, - ); + detectErrorsInObjects(deployments, deploymentErrorMappers); diff --git a/plugins/kubernetes/src/error-detection/error-detection.test.ts b/plugins/kubernetes/src/error-detection/error-detection.test.ts index 2c6373f32d..e3b9e91f6b 100644 --- a/plugins/kubernetes/src/error-detection/error-detection.test.ts +++ b/plugins/kubernetes/src/error-detection/error-detection.test.ts @@ -147,51 +147,61 @@ describe('detectErrors', () => { const [err1, err2, err3, err4] = errors ?? []; expect(err1).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: [ - 'container=other-side-car restarted 38 times', - 'container=side-car restarted 38 times', - ], - names: ['dice-roller-canary-7d64cd756c-55rfq'], - namespace: 'default', + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'dice-roller-canary-7d64cd756c-55rfq', + namespace: 'default', + }, + message: + 'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)', severity: 4, + occuranceCount: 1, + proposedFix: [], + type: 'container-waiting', }); expect(err2).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: [ - 'containers with unready status: [side-car other-side-car]', - 'containers with unready status: [side-car other-side-car]', - ], - names: ['dice-roller-canary-7d64cd756c-55rfq'], - namespace: 'default', - severity: 5, + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'dice-roller-canary-7d64cd756c-55rfq', + namespace: 'default', + }, + message: + 'back-off 5m0s restarting failed container=side-car 
pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)', + severity: 4, + occuranceCount: 1, + proposedFix: [], + type: 'container-waiting', }); expect(err3).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: [ - 'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)', - 'back-off 5m0s restarting failed container=side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)', - ], - names: ['dice-roller-canary-7d64cd756c-55rfq'], - namespace: 'default', - severity: 6, + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'dice-roller-canary-7d64cd756c-55rfq', + namespace: 'default', + }, + message: 'container=other-side-car restarted 123 times', + severity: 4, + occuranceCount: 123, + proposedFix: [], + type: 'containers-restarting', }); expect(err4).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: [ - 'container=other-side-car exited with error code (1)', - 'container=side-car exited with error code (1)', - ], - names: ['dice-roller-canary-7d64cd756c-55rfq'], - namespace: 'default', + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'dice-roller-canary-7d64cd756c-55rfq', + namespace: 'default', + }, + message: 'container=side-car restarted 38 times', severity: 4, + occuranceCount: 38, + proposedFix: [], + type: 'containers-restarting', }); }); it('should detect errors in pod with missing Config Map', () => { @@ -202,29 +212,22 @@ describe('detectErrors', () => { const errors = result.get(CLUSTER_NAME); expect(errors).toBeDefined(); - expect(errors).toHaveLength(2); + expect(errors).toHaveLength(1); - const [err1, err2] = errors ?? []; + const [err1] = errors ?? 
[]; expect(err1).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: [ - 'containers with unready status: [nginx]', - 'containers with unready status: [nginx]', - ], - names: ['dice-roller-bad-cm-855bf85464-mg6xb'], - namespace: 'default', - severity: 5, - }); - - expect(err2).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Pod', - message: ['configmap "some-cm" not found'], - names: ['dice-roller-bad-cm-855bf85464-mg6xb'], - namespace: 'default', - severity: 6, + message: 'configmap "some-cm" not found', + occuranceCount: 1, + proposedFix: [], + severity: 4, + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'dice-roller-bad-cm-855bf85464-mg6xb', + namespace: 'default', + }, + type: 'container-waiting', }); }); it('should detect no errors in healthy deployment', () => { @@ -250,12 +253,17 @@ describe('detectErrors', () => { const [err1] = errors ?? []; expect(err1).toStrictEqual({ - cluster: 'cluster-a', - kind: 'Deployment', - message: ['Deployment does not have minimum availability.'], - names: ['dice-roller-canary'], - namespace: 'default', + sourceRef: { + apiGroup: 'apps/v1', + kind: 'Deployment', + name: 'dice-roller-canary', + namespace: 'default', + }, + message: 'Deployment does not have minimum availability.', severity: 6, + occuranceCount: 1, + proposedFix: [], + type: 'condition-message-present', }); }); it('should detect no errors in healthy hpa', () => { @@ -281,14 +289,159 @@ describe('detectErrors', () => { const [err1] = errors ?? 
[]; expect(err1).toStrictEqual({ - cluster: 'cluster-a', - kind: 'HorizontalPodAutoscaler', - message: [ + sourceRef: { + apiGroup: 'autoscaling/v1', + kind: 'HorizontalPodAutoscaler', + name: 'dice-roller', + namespace: 'default', + }, + message: 'Current number of replicas (10) is equal to the configured max number of replicas (10)', - ], - names: ['dice-roller'], - namespace: 'default', severity: 8, + occuranceCount: 1, + proposedFix: [], + type: 'hpa-max-current-replicas', + }); + }); + it('pending pod is not an error', async () => { + const expiredReadiness = new Date(); + expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1); + const result = await detectErrors( + onePod({ + spec: { + containers: [ + { + name: 'some-container', + readinessProbe: { + initialDelaySeconds: 20000, + failureThreshold: 5, + periodSeconds: 5, + }, + }, + ], + }, + status: { + containerStatuses: [ + { + name: 'some-container', + image: 'some-image', + imageID: 'some-image-id', + restartCount: 0, + containerID: 'running-container', + ready: false, + state: { + running: { + startedAt: new Date().toISOString() as any, + }, + }, + }, + ], + message: 'Container running', + }, + }), + ); + + const errors = result.get(CLUSTER_NAME); + + expect(errors).toBeDefined(); + expect(errors).toHaveLength(0); + }); + it('no probe pod has no errors', async () => { + const expiredReadiness = new Date(); + expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1); + const result = await detectErrors( + onePod({ + spec: { + containers: [ + { + name: 'some-container', + }, + ], + }, + status: { + containerStatuses: [ + { + name: 'some-container', + image: 'some-image', + imageID: 'some-image-id', + restartCount: 0, + containerID: 'running-container', + ready: false, + state: { + running: { + startedAt: new Date().toISOString() as any, + }, + }, + }, + ], + message: 'Container running', + }, + }), + ); + + const errors = result.get(CLUSTER_NAME); + + expect(errors).toBeDefined(); + 
expect(errors).toHaveLength(0); + }); + it('readiness probe failure results in error', async () => { + const expiredReadiness = new Date(); + expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1); + const result = await detectErrors( + onePod({ + spec: { + containers: [ + { + name: 'some-container', + readinessProbe: { + initialDelaySeconds: 20, + failureThreshold: 5, + periodSeconds: 5, + }, + }, + ], + }, + status: { + containerStatuses: [ + { + name: 'some-container', + image: 'some-image', + imageID: 'some-image-id', + restartCount: 0, + containerID: 'running-container', + ready: false, + state: { + running: { + startedAt: expiredReadiness.toISOString() as any, + }, + }, + }, + ], + message: 'Container running', + }, + }), + ); + + const errors = result.get(CLUSTER_NAME); + + expect(errors).toBeDefined(); + expect(errors).toHaveLength(1); + + const [err1] = errors ?? []; + + expect(err1).toStrictEqual({ + message: + 'The container some-container failed to start properly, but is not crashing', + occuranceCount: 1, + proposedFix: [], + severity: 4, + sourceRef: { + apiGroup: 'v1', + kind: 'Pod', + name: 'unknown pod', + namespace: 'unknown namespace', + }, + type: 'readiness-probe-taking-too-long', }); }); }); diff --git a/plugins/kubernetes/src/error-detection/error-detection.ts b/plugins/kubernetes/src/error-detection/error-detection.ts index fddd4d838a..c33de18b80 100644 --- a/plugins/kubernetes/src/error-detection/error-detection.ts +++ b/plugins/kubernetes/src/error-detection/error-detection.ts @@ -20,6 +20,9 @@ import { groupResponses } from '../utils/response'; import { detectErrorsInPods } from './pods'; import { detectErrorsInDeployments } from './deployments'; import { detectErrorsInHpa } from './hpas'; +import { Deployment } from 'kubernetes-models/apps/v1'; +import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1'; +import { Pod } from 'kubernetes-models/v1'; /** * For each cluster try to find errors in each of the object 
types provided @@ -38,20 +41,16 @@ export const detectErrors = ( const groupedResponses = groupResponses(clusterResponse.resources); clusterErrors = clusterErrors.concat( - detectErrorsInPods(groupedResponses.pods, clusterResponse.cluster.name), + detectErrorsInPods(groupedResponses.pods as Pod[]), ); clusterErrors = clusterErrors.concat( - detectErrorsInDeployments( - groupedResponses.deployments, - clusterResponse.cluster.name, - ), + detectErrorsInDeployments(groupedResponses.deployments as Deployment[]), ); clusterErrors = clusterErrors.concat( detectErrorsInHpa( - groupedResponses.horizontalPodAutoscalers, - clusterResponse.cluster.name, + groupedResponses.horizontalPodAutoscalers as HorizontalPodAutoscaler[], ), ); diff --git a/plugins/kubernetes/src/error-detection/hpas.ts b/plugins/kubernetes/src/error-detection/hpas.ts index dd4674a7d0..2dc5e30ce6 100644 --- a/plugins/kubernetes/src/error-detection/hpas.ts +++ b/plugins/kubernetes/src/error-detection/hpas.ts @@ -14,37 +14,39 @@ * limitations under the License. */ -import { V1HorizontalPodAutoscaler } from '@kubernetes/client-node'; +import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1'; import { DetectedError, ErrorMapper } from './types'; import { detectErrorsInObjects } from './common'; -const hpaErrorMappers: ErrorMapper[] = [ +const hpaErrorMappers: ErrorMapper[] = [ { - // this is probably important - severity: 8, - errorExplanation: 'hpa-max-current-replicas', - errorExists: hpa => { - return (hpa.spec?.maxReplicas ?? -1) === hpa.status?.currentReplicas; - }, - messageAccessor: hpa => { - return [ - `Current number of replicas (${ - hpa.status?.currentReplicas - }) is equal to the configured max number of replicas (${ - hpa.spec?.maxReplicas ?? -1 - })`, - ]; + detectErrors: hpa => { + if ((hpa.spec?.maxReplicas ?? 
-1) === hpa.status?.currentReplicas) { + return [ + { + type: 'hpa-max-current-replicas', + message: `Current number of replicas (${ + hpa.status?.currentReplicas + }) is equal to the configured max number of replicas (${ + hpa.spec?.maxReplicas ?? -1 + })`, + severity: 8, + proposedFix: [], // TODO next PR + sourceRef: { + name: hpa.metadata?.name ?? 'unknown hpa', + namespace: hpa.metadata?.namespace ?? 'unknown namespace', + kind: 'HorizontalPodAutoscaler', + apiGroup: 'autoscaling/v1', + }, + occuranceCount: 1, + }, + ]; + } + return []; }, }, ]; export const detectErrorsInHpa = ( - hpas: V1HorizontalPodAutoscaler[], - clusterName: string, -): DetectedError[] => - detectErrorsInObjects( - hpas, - 'HorizontalPodAutoscaler', - clusterName, - hpaErrorMappers, - ); + hpas: HorizontalPodAutoscaler[], +): DetectedError[] => detectErrorsInObjects(hpas, hpaErrorMappers); diff --git a/plugins/kubernetes/src/error-detection/index.ts b/plugins/kubernetes/src/error-detection/index.ts index ced03066a7..67bfb70f3b 100644 --- a/plugins/kubernetes/src/error-detection/index.ts +++ b/plugins/kubernetes/src/error-detection/index.ts @@ -15,7 +15,6 @@ */ export type { - ErrorDetectableKind, DetectedError, DetectedErrorsByCluster, ErrorSeverity, diff --git a/plugins/kubernetes/src/error-detection/pods.ts b/plugins/kubernetes/src/error-detection/pods.ts index 1ad6e3485f..a85276eaa2 100644 --- a/plugins/kubernetes/src/error-detection/pods.ts +++ b/plugins/kubernetes/src/error-detection/pods.ts @@ -14,82 +14,121 @@ * limitations under the License. 
*/ -import { V1Pod } from '@kubernetes/client-node'; -import { totalRestarts } from '../utils/pod'; +import { Pod, IContainerStatus, IContainer } from 'kubernetes-models/v1'; import { DetectedError, ErrorMapper } from './types'; import { detectErrorsInObjects } from './common'; +import lodash from 'lodash'; +import { DateTime } from 'luxon'; -const podErrorMappers: ErrorMapper[] = [ +function isPodReadinessProbeUnready({ + container, + containerStatus, +}: ContainerSpecAndStatus): boolean { + if ( + containerStatus.ready || + containerStatus.state?.running?.startedAt === undefined || + !container.readinessProbe + ) { + return false; + } + const startDateTime = DateTime.fromISO( + containerStatus.state?.running?.startedAt, + ) + // Add initial delay + .plus({ + seconds: container.readinessProbe?.initialDelaySeconds ?? 0, + }) + // Add failure threshold + .plus({ + seconds: + (container.readinessProbe?.periodSeconds ?? 0) * + (container.readinessProbe?.failureThreshold ?? 0), + }); + return startDateTime < DateTime.now(); +} + +interface ContainerSpecAndStatus { + container: IContainer; + containerStatus: IContainerStatus; +} + +const podToContainerSpecsAndStatuses = (pod: Pod): ContainerSpecAndStatus[] => { + const specs = lodash.groupBy(pod.spec?.containers ?? [], value => value.name); + + const result: ContainerSpecAndStatus[] = []; + + for (const cs of pod.status?.containerStatuses ?? []) { + const spec = specs[cs.name]; + if (spec.length > 0) { + result.push({ + container: spec[0], + containerStatus: cs, + }); + } + } + + return result; +}; + +const podErrorMappers: ErrorMapper[] = [ { - severity: 5, - errorExplanation: 'status-message', - errorExists: pod => { - return pod.status?.message !== undefined; - }, - messageAccessor: pod => { - return [pod.status?.message ?? 
'']; + detectErrors: pod => { + return podToContainerSpecsAndStatuses(pod) + .filter(isPodReadinessProbeUnready) + .map(cs => ({ + type: 'readiness-probe-taking-too-long', + message: `The container ${cs.container.name} failed to start properly, but is not crashing`, + severity: 4, + proposedFix: [], // TODO next PR + sourceRef: { + name: pod.metadata?.name ?? 'unknown pod', + namespace: pod.metadata?.namespace ?? 'unknown namespace', + kind: 'Pod', + apiGroup: 'v1', + }, + occuranceCount: 1, + })); }, }, { - severity: 4, - errorExplanation: 'containers-restarting', - errorExists: pod => { - // TODO magic number - return totalRestarts(pod) > 3; - }, - messageAccessor: pod => { - return (pod.status?.containerStatuses ?? []) - .filter(cs => cs.restartCount > 0) - .map(cs => `container=${cs.name} restarted ${cs.restartCount} times`); - }, - }, - { - severity: 5, - errorExplanation: 'condition-message-present', - errorExists: pod => { - return (pod.status?.conditions ?? []).some(c => c.message !== undefined); - }, - messageAccessor: pod => { - return (pod.status?.conditions ?? []) - .filter(c => c.message !== undefined) - .map(c => c.message ?? ''); - }, - }, - { - severity: 6, - errorExplanation: 'container-waiting', - errorExists: pod => { - return (pod.status?.containerStatuses ?? []).some( - cs => cs.state?.waiting?.message !== undefined, - ); - }, - messageAccessor: pod => { + detectErrors: pod => { return (pod.status?.containerStatuses ?? []) .filter(cs => cs.state?.waiting?.message !== undefined) - .map(cs => cs.state?.waiting?.message ?? ''); + .map(cs => ({ + type: 'container-waiting', + message: cs.state?.waiting?.message ?? 'container waiting', + severity: 4, + proposedFix: [], // TODO next PR + sourceRef: { + name: pod.metadata?.name ?? 'unknown pod', + namespace: pod.metadata?.namespace ?? 
'unknown namespace', + kind: 'Pod', + apiGroup: 'v1', + }, + occuranceCount: 1, + })); }, }, { - severity: 4, - errorExplanation: 'container-last-state-error', - errorExists: pod => { - return (pod.status?.containerStatuses ?? []).some( - cs => (cs.lastState?.terminated?.reason ?? '') === 'Error', - ); - }, - messageAccessor: pod => { + detectErrors: pod => { return (pod.status?.containerStatuses ?? []) - .filter(cs => (cs.lastState?.terminated?.reason ?? '') === 'Error') - .map( - cs => - `container=${cs.name} exited with error code (${cs.lastState?.terminated?.exitCode})`, - ); + .filter(cs => cs.restartCount > 0) + .map(cs => ({ + type: 'containers-restarting', + message: `container=${cs.name} restarted ${cs.restartCount} times`, + severity: 4, + proposedFix: [], // TODO next PR + sourceRef: { + name: pod.metadata?.name ?? 'unknown pod', + namespace: pod.metadata?.namespace ?? 'unknown namespace', + kind: 'Pod', + apiGroup: 'v1', + }, + occuranceCount: cs.restartCount, + })); }, }, ]; -export const detectErrorsInPods = ( - pods: V1Pod[], - clusterName: string, -): DetectedError[] => - detectErrorsInObjects(pods, 'Pod', clusterName, podErrorMappers); +export const detectErrorsInPods = (pods: Pod[]): DetectedError[] => + detectErrorsInObjects(pods, podErrorMappers); diff --git a/plugins/kubernetes/src/error-detection/types.ts b/plugins/kubernetes/src/error-detection/types.ts index 4bc7ae164d..3698d9bc68 100644 --- a/plugins/kubernetes/src/error-detection/types.ts +++ b/plugins/kubernetes/src/error-detection/types.ts @@ -14,13 +14,6 @@ * limitations under the License. */ -// Higher is more sever, but it's relative -import { - V1Deployment, - V1HorizontalPodAutoscaler, - V1Pod, -} from '@kubernetes/client-node'; - /** * Severity of the error, where 10 is critical and 0 is very low. 
* @@ -28,18 +21,6 @@ import { */ export type ErrorSeverity = 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10; -export type ErrorDetectable = V1Pod | V1Deployment | V1HorizontalPodAutoscaler; - -/** - * Kubernetes kinds that errors might be reported by the plugin - * - * @public - */ -export type ErrorDetectableKind = - | 'Pod' - | 'Deployment' - | 'HorizontalPodAutoscaler'; - /** * A list of errors keyed by Cluster name * @@ -47,23 +28,51 @@ export type ErrorDetectableKind = */ export type DetectedErrorsByCluster = Map; +export interface ResourceRef { + name: string; + namespace: string; + kind: string; + apiGroup: string; +} + /** * Represents an error found on a Kubernetes object * * @public */ export interface DetectedError { + type: string; severity: ErrorSeverity; - cluster: string; - namespace: string; - kind: ErrorDetectableKind; - names: string[]; - message: string[]; + message: string; + proposedFix: ProposedFix[]; + sourceRef: ResourceRef; + occuranceCount: number; } -export interface ErrorMapper { - severity: ErrorSeverity; - errorExplanation: string; - errorExists: (object: T) => boolean; - messageAccessor: (object: T) => string[]; +type ProposedFix = LogSolution | DocsSolution | EventsSolution; + +interface ProposedFixBase { + errorType: string; + rootCauseExplanation: string; + possibleFixes: string[]; +} + +export interface LogSolution extends ProposedFixBase { + type: 'logs'; + container: string; +} + +export interface DocsSolution extends ProposedFixBase { + type: 'docs'; + docsLink: string; +} + +export interface EventsSolution extends ProposedFixBase { + type: 'events'; + docsLink: string; + podName: string; +} + +export interface ErrorMapper { + detectErrors: (resource: T) => DetectedError[]; } diff --git a/yarn.lock b/yarn.lock index b893d378de..ed252fd849 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7173,6 +7173,7 @@ __metadata: cronstrue: ^2.2.0 cross-fetch: ^3.1.5 js-yaml: ^4.0.0 + kubernetes-models: ^4.1.0 lodash: ^4.17.21 luxon: ^3.0.0 msw: 
^1.0.0 @@ -11926,6 +11927,39 @@ __metadata: languageName: node linkType: hard +"@kubernetes-models/apimachinery@npm:^1.1.0": + version: 1.1.0 + resolution: "@kubernetes-models/apimachinery@npm:1.1.0" + dependencies: + "@kubernetes-models/base": ^4.0.0 + "@kubernetes-models/validate": ^3.0.0 + tslib: ^2.4.0 + checksum: 71fc127a8e50c3686e32d9f9df94a307ae4d69995d6e82ea50a7eb9a5ce6a18e9950700ad0eb445e1451c997aabb22e2594ab1037c11faa9c0051b51bab569c6 + languageName: node + linkType: hard + +"@kubernetes-models/base@npm:^4.0.0": + version: 4.0.0 + resolution: "@kubernetes-models/base@npm:4.0.0" + dependencies: + "@kubernetes-models/validate": ^3.0.0 + is-plain-object: ^5.0.0 + tslib: ^2.4.0 + checksum: 3c44f2220e119c24b874d6d674f38f9796f0ab682ba0cff1516d39549a6536055348bcee6c3c5257ebb8d4d6c8d1865d84e519d8f8948e6dd29eb1d72f3830cb + languageName: node + linkType: hard + +"@kubernetes-models/validate@npm:^3.0.0": + version: 3.0.0 + resolution: "@kubernetes-models/validate@npm:3.0.0" + dependencies: + ajv: ^8.11.0 + ajv-formats: ^2.1.1 + tslib: ^2.4.0 + checksum: b53b7181ddaad7e8faf6ceac228cb23df239ef5da3784887f3378dd3a7f89e1936b7b1884ab123255a905627311bab4302a06668fba35d26152e69fbf208a855 + languageName: node + linkType: hard + "@kubernetes/client-node@npm:0.18.1": version: 0.18.1 resolution: "@kubernetes/client-node@npm:0.18.1" @@ -17602,7 +17636,7 @@ __metadata: languageName: node linkType: hard -"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0": +"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.11.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0": version: 8.12.0 resolution: "ajv@npm:8.12.0" dependencies: @@ -28790,6 +28824,18 @@ __metadata: languageName: node linkType: hard +"kubernetes-models@npm:^4.1.0": + version: 4.1.0 + resolution: "kubernetes-models@npm:4.1.0" + dependencies: + "@kubernetes-models/apimachinery": ^1.1.0 + "@kubernetes-models/base": ^4.0.0 + "@kubernetes-models/validate": ^3.0.0 + tslib: ^2.4.0 + checksum: 
1c3195a78ca422a6436f9c4143288a4d29f53b75ff597d72f44cc7f274714daab40a5bb99dd13e02edf3ce11a41f1df34d3d0bb1930b6401cc21c44666ad2b6b + languageName: node + linkType: hard + "kuler@npm:^2.0.0": version: 2.0.0 resolution: "kuler@npm:2.0.0"