refactor: kubernetes error detection (#17135)

* feat: refactor kubernetes error detection to make way for proposed solutions

Signed-off-by: Matthew Clarke <mclarke@spotify.com>

* docs: api-docs

Signed-off-by: Matthew Clarke <mclarke@spotify.com>

* chore: changeset

Signed-off-by: Matthew Clarke <mclarke@spotify.com>

* refactor: use luxon

Signed-off-by: Matthew Clarke <mclarke@spotify.com>

---------

Signed-off-by: Matthew Clarke <mclarke@spotify.com>
This commit is contained in:
Matthew Clarke
2023-04-05 04:27:00 -04:00
committed by GitHub
parent 4bea0f1d1b
commit 754be7c510
15 changed files with 504 additions and 318 deletions
+7
View File
@@ -0,0 +1,7 @@
---
'@backstage/plugin-kubernetes': minor
---
refactor kubernetes error detection to make way for proposed solutions
**BREAKING**: `DetectedError` now appears once per Kubernetes resource per error, instead of once for all resources which have that error; the `namespace` and `name` fields are now in the `sourceRef` object; and `message` is now a `string` instead of a `string[]`. `ErrorDetectableKind` has been removed.
+11 -13
View File
@@ -104,17 +104,21 @@ export interface DeploymentResources {
// @public
export interface DetectedError {
// (undocumented)
cluster: string;
message: string;
// (undocumented)
kind: ErrorDetectableKind;
occuranceCount: number;
// Warning: (ae-forgotten-export) The symbol "ProposedFix" needs to be exported by the entry point index.d.ts
//
// (undocumented)
message: string[];
// (undocumented)
names: string[];
// (undocumented)
namespace: string;
proposedFix: ProposedFix[];
// (undocumented)
severity: ErrorSeverity;
// Warning: (ae-forgotten-export) The symbol "ResourceRef" needs to be exported by the entry point index.d.ts
//
// (undocumented)
sourceRef: ResourceRef;
// (undocumented)
type: string;
}
// @public
@@ -137,12 +141,6 @@ export type EntityKubernetesContentProps = {
refreshIntervalMs?: number;
};
// @public
export type ErrorDetectableKind =
| 'Pod'
| 'Deployment'
| 'HorizontalPodAutoscaler';
// Warning: (ae-forgotten-export) The symbol "ErrorPanelProps" needs to be exported by the entry point index.d.ts
// Warning: (ae-missing-release-tag) "ErrorPanel" is part of the package's API, but it is missing a release tag (@alpha, @beta, @public, or @internal)
//
+1
View File
@@ -47,6 +47,7 @@
"@types/react": "^16.13.1 || ^17.0.0",
"cronstrue": "^2.2.0",
"js-yaml": "^4.0.0",
"kubernetes-models": "^4.1.0",
"lodash": "^4.17.21",
"luxon": "^3.0.0",
"react-use": "^17.2.4"
@@ -15,84 +15,64 @@
*/
import * as React from 'react';
import { DetectedError, DetectedErrorsByCluster } from '../../error-detection';
import { Chip } from '@material-ui/core';
import { Table, TableColumn } from '@backstage/core-components';
type ErrorReportingProps = {
detectedErrors: DetectedErrorsByCluster;
};
const columns: TableColumn<DetectedError>[] = [
const columns: TableColumn<Row>[] = [
{
title: 'cluster',
width: '10%',
render: (detectedError: DetectedError) => detectedError.cluster,
render: (row: Row) => row.clusterName,
},
{
title: 'namespace',
width: '10%',
render: (detectedError: DetectedError) => detectedError.namespace,
render: (row: Row) => row.error.sourceRef.namespace,
},
{
title: 'kind',
width: '10%',
render: (detectedError: DetectedError) => detectedError.kind,
render: (row: Row) => row.error.sourceRef.kind,
},
{
title: 'name',
width: '30%',
render: (detectedError: DetectedError) => {
const errorCount = detectedError.names.length;
if (errorCount === 0) {
// This shouldn't happen
return null;
}
const displayName = detectedError.names[0];
const otherErrorCount = errorCount - 1;
return (
<>
{displayName}{' '}
{otherErrorCount > 0 && (
<Chip
label={`+ ${otherErrorCount} other${
otherErrorCount > 1 ? 's' : ''
}`}
size="small"
/>
)}
</>
);
render: (row: Row) => {
return <>{row.error.sourceRef.name} </>;
},
},
{
title: 'messages',
width: '40%',
render: (detectedError: DetectedError) => (
<>
{detectedError.message.map((m, i) => (
<div key={i}>{m}</div>
))}
</>
),
render: (row: Row) => row.error.message,
},
];
const sortBySeverity = (a: DetectedError, b: DetectedError) => {
if (a.severity < b.severity) {
// One row of the error table: a single detected error paired with the
// cluster it was found in.
interface Row {
// Name of the cluster the error belongs to (the key of the
// DetectedErrorsByCluster map the row was built from).
clusterName: string;
// The error detected on a single Kubernetes resource.
error: DetectedError;
}
const sortBySeverity = (a: Row, b: Row) => {
if (a.error.severity < b.error.severity) {
return 1;
} else if (b.severity < a.severity) {
} else if (b.error.severity < a.error.severity) {
return -1;
}
return 0;
};
export const ErrorReporting = ({ detectedErrors }: ErrorReportingProps) => {
const errors = Array.from(detectedErrors.values())
.flat()
const errors = Array.from(detectedErrors.entries())
.flatMap(([clusterName, resourceErrors]) => {
return resourceErrors.map(e => ({
clusterName,
error: e,
}));
})
.sort(sortBySeverity);
return (
@@ -114,9 +114,8 @@ export const KubernetesContent = ({
const podsWithErrors = new Set<string>(
detectedErrors
.get(item.cluster.name)
?.filter(de => de.kind === 'Pod')
.map(de => de.names)
.flat() ?? [],
?.filter(de => de.sourceRef.kind === 'Pod')
.map(de => de.sourceRef.name),
);
return (
@@ -185,7 +185,7 @@
},
"name": "other-side-car",
"ready": false,
"restartCount": 38,
"restartCount": 123,
"started": false,
"state": {
"waiting": {
@@ -14,59 +14,15 @@
* limitations under the License.
*/
import {
DetectedError,
ErrorDetectable,
ErrorDetectableKind,
ErrorMapper,
} from './types';
import { DetectedError, ErrorMapper } from './types';
// Run each error mapper against each object and return every detected
// error as a flat list (errors are no longer deduplicated or grouped)
export const detectErrorsInObjects = <T extends ErrorDetectable>(
export const detectErrorsInObjects = <T>(
objects: T[],
kind: ErrorDetectableKind,
clusterName: string,
errorMappers: ErrorMapper<T>[],
): DetectedError[] => {
// Build up a map of errors
// key: the joined message produced by an error
// value: the error
const errors = new Map<string, DetectedError>();
for (const object of objects) {
for (const errorMapper of errorMappers) {
if (errorMapper.errorExists(object)) {
const message = errorMapper.messageAccessor(object);
// TODO This is not perfect as errors with uuid/hashes/date/times will not be caught by this
const dedupKey = message.join('');
const value = errors.get(dedupKey);
const name = object.metadata?.name ?? 'unknown';
const namespace = object.metadata?.namespace ?? 'unknown';
if (value !== undefined) {
// This gets translated into the Chip "+5 others"
// in the ErrorReporting component
// but we need to keep the names so we can easily
// find which objects owns the error later
value.names.push(name);
errors.set(dedupKey, value);
} else {
errors.set(dedupKey, {
cluster: clusterName,
kind: kind,
names: [name],
message: message,
severity: errorMapper.severity,
namespace,
});
}
}
}
}
return Array.from(errors.values());
return objects.flatMap(o => {
return errorMappers.flatMap(em => em.detectErrors(o));
});
};
@@ -15,35 +15,33 @@
*/
import { DetectedError, ErrorMapper } from './types';
import { V1Deployment } from '@kubernetes/client-node';
import { Deployment } from 'kubernetes-models/apps/v1';
import { detectErrorsInObjects } from './common';
const deploymentErrorMappers: ErrorMapper<V1Deployment>[] = [
const deploymentErrorMappers: ErrorMapper<Deployment>[] = [
{
// this is probably important
severity: 6,
errorExplanation: 'condition-message-present',
errorExists: deployment => {
return (deployment.status?.conditions ?? [])
.filter(c => c.status === 'False')
.some(c => c.message !== undefined);
},
messageAccessor: deployment => {
detectErrors: deployment => {
return (deployment.status?.conditions ?? [])
.filter(c => c.status === 'False')
.filter(c => c.message !== undefined)
.map(c => c.message ?? '');
.map(c => ({
type: 'condition-message-present',
message: c.message ?? '',
severity: 6,
proposedFix: [], // TODO next PR
sourceRef: {
// Fallback label used when the Deployment has no metadata.name.
name: deployment.metadata?.name ?? 'unknown deployment',
namespace: deployment.metadata?.namespace ?? 'unknown namespace',
kind: 'Deployment',
apiGroup: 'apps/v1',
},
occuranceCount: 1,
}));
},
},
];
export const detectErrorsInDeployments = (
deployments: V1Deployment[],
clusterName: string,
deployments: Deployment[],
): DetectedError[] =>
detectErrorsInObjects(
deployments,
'Deployment',
clusterName,
deploymentErrorMappers,
);
detectErrorsInObjects(deployments, deploymentErrorMappers);
@@ -147,51 +147,61 @@ describe('detectErrors', () => {
const [err1, err2, err3, err4] = errors ?? [];
expect(err1).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: [
'container=other-side-car restarted 38 times',
'container=side-car restarted 38 times',
],
names: ['dice-roller-canary-7d64cd756c-55rfq'],
namespace: 'default',
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'dice-roller-canary-7d64cd756c-55rfq',
namespace: 'default',
},
message:
'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
severity: 4,
occuranceCount: 1,
proposedFix: [],
type: 'container-waiting',
});
expect(err2).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: [
'containers with unready status: [side-car other-side-car]',
'containers with unready status: [side-car other-side-car]',
],
names: ['dice-roller-canary-7d64cd756c-55rfq'],
namespace: 'default',
severity: 5,
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'dice-roller-canary-7d64cd756c-55rfq',
namespace: 'default',
},
message:
'back-off 5m0s restarting failed container=side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
severity: 4,
occuranceCount: 1,
proposedFix: [],
type: 'container-waiting',
});
expect(err3).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: [
'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
'back-off 5m0s restarting failed container=side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
],
names: ['dice-roller-canary-7d64cd756c-55rfq'],
namespace: 'default',
severity: 6,
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'dice-roller-canary-7d64cd756c-55rfq',
namespace: 'default',
},
message: 'container=other-side-car restarted 123 times',
severity: 4,
occuranceCount: 123,
proposedFix: [],
type: 'containers-restarting',
});
expect(err4).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: [
'container=other-side-car exited with error code (1)',
'container=side-car exited with error code (1)',
],
names: ['dice-roller-canary-7d64cd756c-55rfq'],
namespace: 'default',
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'dice-roller-canary-7d64cd756c-55rfq',
namespace: 'default',
},
message: 'container=side-car restarted 38 times',
severity: 4,
occuranceCount: 38,
proposedFix: [],
type: 'containers-restarting',
});
});
it('should detect errors in pod with missing Config Map', () => {
@@ -202,29 +212,22 @@ describe('detectErrors', () => {
const errors = result.get(CLUSTER_NAME);
expect(errors).toBeDefined();
expect(errors).toHaveLength(2);
expect(errors).toHaveLength(1);
const [err1, err2] = errors ?? [];
const [err1] = errors ?? [];
expect(err1).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: [
'containers with unready status: [nginx]',
'containers with unready status: [nginx]',
],
names: ['dice-roller-bad-cm-855bf85464-mg6xb'],
namespace: 'default',
severity: 5,
});
expect(err2).toStrictEqual({
cluster: 'cluster-a',
kind: 'Pod',
message: ['configmap "some-cm" not found'],
names: ['dice-roller-bad-cm-855bf85464-mg6xb'],
namespace: 'default',
severity: 6,
message: 'configmap "some-cm" not found',
occuranceCount: 1,
proposedFix: [],
severity: 4,
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'dice-roller-bad-cm-855bf85464-mg6xb',
namespace: 'default',
},
type: 'container-waiting',
});
});
it('should detect no errors in healthy deployment', () => {
@@ -250,12 +253,17 @@ describe('detectErrors', () => {
const [err1] = errors ?? [];
expect(err1).toStrictEqual({
cluster: 'cluster-a',
kind: 'Deployment',
message: ['Deployment does not have minimum availability.'],
names: ['dice-roller-canary'],
namespace: 'default',
sourceRef: {
apiGroup: 'apps/v1',
kind: 'Deployment',
name: 'dice-roller-canary',
namespace: 'default',
},
message: 'Deployment does not have minimum availability.',
severity: 6,
occuranceCount: 1,
proposedFix: [],
type: 'condition-message-present',
});
});
it('should detect no errors in healthy hpa', () => {
@@ -281,14 +289,159 @@ describe('detectErrors', () => {
const [err1] = errors ?? [];
expect(err1).toStrictEqual({
cluster: 'cluster-a',
kind: 'HorizontalPodAutoscaler',
message: [
sourceRef: {
apiGroup: 'autoscaling/v1',
kind: 'HorizontalPodAutoscaler',
name: 'dice-roller',
namespace: 'default',
},
message:
'Current number of replicas (10) is equal to the configured max number of replicas (10)',
],
names: ['dice-roller'],
namespace: 'default',
severity: 8,
occuranceCount: 1,
proposedFix: [],
type: 'hpa-max-current-replicas',
});
});
it('pending pod is not an error', async () => {
  // The container has only just started and its readiness probe allows a
  // very long initial delay (20000s), so being unready is still expected
  // and must not be reported as an error.
  const result = await detectErrors(
    onePod({
      spec: {
        containers: [
          {
            name: 'some-container',
            readinessProbe: {
              initialDelaySeconds: 20000,
              failureThreshold: 5,
              periodSeconds: 5,
            },
          },
        ],
      },
      status: {
        containerStatuses: [
          {
            name: 'some-container',
            image: 'some-image',
            imageID: 'some-image-id',
            restartCount: 0,
            containerID: 'running-container',
            ready: false,
            state: {
              running: {
                startedAt: new Date().toISOString() as any,
              },
            },
          },
        ],
        message: 'Container running',
      },
    }),
  );
  const errors = result.get(CLUSTER_NAME);
  expect(errors).toBeDefined();
  expect(errors).toHaveLength(0);
});
it('no probe pod has no errors', async () => {
  // The container declares no readiness probe, so an unready container
  // must not produce a readiness-probe error.
  const result = await detectErrors(
    onePod({
      spec: {
        containers: [
          {
            name: 'some-container',
          },
        ],
      },
      status: {
        containerStatuses: [
          {
            name: 'some-container',
            image: 'some-image',
            imageID: 'some-image-id',
            restartCount: 0,
            containerID: 'running-container',
            ready: false,
            state: {
              running: {
                startedAt: new Date().toISOString() as any,
              },
            },
          },
        ],
        message: 'Container running',
      },
    }),
  );
  const errors = result.get(CLUSTER_NAME);
  expect(errors).toBeDefined();
  expect(errors).toHaveLength(0);
});
// An unready container whose readiness window has long expired must be
// reported as a single 'readiness-probe-taking-too-long' error.
it('readiness probe failure results in error', async () => {
// Start time one year in the past: far beyond the probe's initial delay
// plus period x failure threshold (20s + 5s x 5).
const expiredReadiness = new Date();
expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1);
const result = await detectErrors(
onePod({
spec: {
containers: [
{
name: 'some-container',
readinessProbe: {
initialDelaySeconds: 20,
failureThreshold: 5,
periodSeconds: 5,
},
},
],
},
status: {
containerStatuses: [
{
name: 'some-container',
image: 'some-image',
imageID: 'some-image-id',
restartCount: 0,
containerID: 'running-container',
ready: false,
state: {
running: {
startedAt: expiredReadiness.toISOString() as any,
},
},
},
],
message: 'Container running',
},
}),
);
const errors = result.get(CLUSTER_NAME);
expect(errors).toBeDefined();
expect(errors).toHaveLength(1);
const [err1] = errors ?? [];
// The 'unknown pod' / 'unknown namespace' values are the detector's
// fallbacks; presumably onePod() builds the pod without metadata
// (helper not visible here, verify).
expect(err1).toStrictEqual({
message:
'The container some-container failed to start properly, but is not crashing',
occuranceCount: 1,
proposedFix: [],
severity: 4,
sourceRef: {
apiGroup: 'v1',
kind: 'Pod',
name: 'unknown pod',
namespace: 'unknown namespace',
},
type: 'readiness-probe-taking-too-long',
});
});
});
@@ -20,6 +20,9 @@ import { groupResponses } from '../utils/response';
import { detectErrorsInPods } from './pods';
import { detectErrorsInDeployments } from './deployments';
import { detectErrorsInHpa } from './hpas';
import { Deployment } from 'kubernetes-models/apps/v1';
import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1';
import { Pod } from 'kubernetes-models/v1';
/**
* For each cluster try to find errors in each of the object types provided
@@ -38,20 +41,16 @@ export const detectErrors = (
const groupedResponses = groupResponses(clusterResponse.resources);
clusterErrors = clusterErrors.concat(
detectErrorsInPods(groupedResponses.pods, clusterResponse.cluster.name),
detectErrorsInPods(groupedResponses.pods as Pod[]),
);
clusterErrors = clusterErrors.concat(
detectErrorsInDeployments(
groupedResponses.deployments,
clusterResponse.cluster.name,
),
detectErrorsInDeployments(groupedResponses.deployments as Deployment[]),
);
clusterErrors = clusterErrors.concat(
detectErrorsInHpa(
groupedResponses.horizontalPodAutoscalers,
clusterResponse.cluster.name,
groupedResponses.horizontalPodAutoscalers as HorizontalPodAutoscaler[],
),
);
+27 -25
View File
@@ -14,37 +14,39 @@
* limitations under the License.
*/
import { V1HorizontalPodAutoscaler } from '@kubernetes/client-node';
import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1';
import { DetectedError, ErrorMapper } from './types';
import { detectErrorsInObjects } from './common';
const hpaErrorMappers: ErrorMapper<V1HorizontalPodAutoscaler>[] = [
const hpaErrorMappers: ErrorMapper<HorizontalPodAutoscaler>[] = [
{
// this is probably important
severity: 8,
errorExplanation: 'hpa-max-current-replicas',
errorExists: hpa => {
return (hpa.spec?.maxReplicas ?? -1) === hpa.status?.currentReplicas;
},
messageAccessor: hpa => {
return [
`Current number of replicas (${
hpa.status?.currentReplicas
}) is equal to the configured max number of replicas (${
hpa.spec?.maxReplicas ?? -1
})`,
];
detectErrors: hpa => {
if ((hpa.spec?.maxReplicas ?? -1) === hpa.status?.currentReplicas) {
return [
{
type: 'hpa-max-current-replicas',
message: `Current number of replicas (${
hpa.status?.currentReplicas
}) is equal to the configured max number of replicas (${
hpa.spec?.maxReplicas ?? -1
})`,
severity: 8,
proposedFix: [], // TODO next PR
sourceRef: {
name: hpa.metadata?.name ?? 'unknown hpa',
namespace: hpa.metadata?.namespace ?? 'unknown namespace',
kind: 'HorizontalPodAutoscaler',
apiGroup: 'autoscaling/v1',
},
occuranceCount: 1,
},
];
}
return [];
},
},
];
export const detectErrorsInHpa = (
hpas: V1HorizontalPodAutoscaler[],
clusterName: string,
): DetectedError[] =>
detectErrorsInObjects(
hpas,
'HorizontalPodAutoscaler',
clusterName,
hpaErrorMappers,
);
hpas: HorizontalPodAutoscaler[],
): DetectedError[] => detectErrorsInObjects(hpas, hpaErrorMappers);
@@ -15,7 +15,6 @@
*/
export type {
ErrorDetectableKind,
DetectedError,
DetectedErrorsByCluster,
ErrorSeverity,
+101 -62
View File
@@ -14,82 +14,121 @@
* limitations under the License.
*/
import { V1Pod } from '@kubernetes/client-node';
import { totalRestarts } from '../utils/pod';
import { Pod, IContainerStatus, IContainer } from 'kubernetes-models/v1';
import { DetectedError, ErrorMapper } from './types';
import { detectErrorsInObjects } from './common';
import lodash from 'lodash';
import { DateTime } from 'luxon';
const podErrorMappers: ErrorMapper<V1Pod>[] = [
/**
 * Decides whether a running container has stayed unready for longer than its
 * readiness probe allows.
 *
 * Returns `false` when the container is ready, has no recorded running start
 * time, or declares no readiness probe. Otherwise the deadline is the start
 * time plus the probe's initial delay plus `periodSeconds * failureThreshold`
 * (each missing value counts as 0), and the result is `true` once that
 * deadline lies in the past.
 */
function isPodReadinessProbeUnready({
  container,
  containerStatus,
}: ContainerSpecAndStatus): boolean {
  const probe = container.readinessProbe;
  const startedAt = containerStatus.state?.running?.startedAt;

  if (containerStatus.ready) {
    return false;
  }
  if (startedAt === undefined) {
    return false;
  }
  if (!probe) {
    return false;
  }

  const initialDelaySeconds = probe.initialDelaySeconds ?? 0;
  // Time the probe may keep failing before the container counts as unready.
  const failureWindowSeconds =
    (probe.periodSeconds ?? 0) * (probe.failureThreshold ?? 0);

  const readinessDeadline = DateTime.fromISO(startedAt)
    .plus({ seconds: initialDelaySeconds })
    .plus({ seconds: failureWindowSeconds });

  return readinessDeadline < DateTime.now();
}
// A container's spec paired with the status reported for the container of
// the same name in the same pod.
interface ContainerSpecAndStatus {
// Container definition taken from the pod's spec.containers.
container: IContainer;
// Matching entry from the pod's status.containerStatuses.
containerStatus: IContainerStatus;
}
/**
 * Pairs every container status reported on a pod with the container spec of
 * the same name.
 *
 * Statuses with no matching spec (including every status when the pod has no
 * spec at all) are skipped. When several specs share a name, the first one is
 * used.
 *
 * @param pod - the pod whose containers should be paired up
 * @returns one entry per container status that has a matching spec, in
 *   status order
 */
const podToContainerSpecsAndStatuses = (pod: Pod): ContainerSpecAndStatus[] => {
  const specsByName = lodash.groupBy(
    pod.spec?.containers ?? [],
    value => value.name,
  );
  return (pod.status?.containerStatuses ?? []).flatMap(containerStatus => {
    // groupBy has no entry for names absent from the spec, so the lookup can
    // be undefined; fall back to an empty list instead of dereferencing it.
    const [container] = specsByName[containerStatus.name] ?? [];
    return container === undefined ? [] : [{ container, containerStatus }];
  });
};
const podErrorMappers: ErrorMapper<Pod>[] = [
{
severity: 5,
errorExplanation: 'status-message',
errorExists: pod => {
return pod.status?.message !== undefined;
},
messageAccessor: pod => {
return [pod.status?.message ?? ''];
detectErrors: pod => {
return podToContainerSpecsAndStatuses(pod)
.filter(isPodReadinessProbeUnready)
.map(cs => ({
type: 'readiness-probe-taking-too-long',
message: `The container ${cs.container.name} failed to start properly, but is not crashing`,
severity: 4,
proposedFix: [], // TODO next PR
sourceRef: {
name: pod.metadata?.name ?? 'unknown pod',
namespace: pod.metadata?.namespace ?? 'unknown namespace',
kind: 'Pod',
apiGroup: 'v1',
},
occuranceCount: 1,
}));
},
},
{
severity: 4,
errorExplanation: 'containers-restarting',
errorExists: pod => {
// TODO magic number
return totalRestarts(pod) > 3;
},
messageAccessor: pod => {
return (pod.status?.containerStatuses ?? [])
.filter(cs => cs.restartCount > 0)
.map(cs => `container=${cs.name} restarted ${cs.restartCount} times`);
},
},
{
severity: 5,
errorExplanation: 'condition-message-present',
errorExists: pod => {
return (pod.status?.conditions ?? []).some(c => c.message !== undefined);
},
messageAccessor: pod => {
return (pod.status?.conditions ?? [])
.filter(c => c.message !== undefined)
.map(c => c.message ?? '');
},
},
{
severity: 6,
errorExplanation: 'container-waiting',
errorExists: pod => {
return (pod.status?.containerStatuses ?? []).some(
cs => cs.state?.waiting?.message !== undefined,
);
},
messageAccessor: pod => {
detectErrors: pod => {
return (pod.status?.containerStatuses ?? [])
.filter(cs => cs.state?.waiting?.message !== undefined)
.map(cs => cs.state?.waiting?.message ?? '');
.map(cs => ({
type: 'container-waiting',
message: cs.state?.waiting?.message ?? 'container waiting',
severity: 4,
proposedFix: [], // TODO next PR
sourceRef: {
name: pod.metadata?.name ?? 'unknown pod',
namespace: pod.metadata?.namespace ?? 'unknown namespace',
kind: 'Pod',
apiGroup: 'v1',
},
occuranceCount: 1,
}));
},
},
{
severity: 4,
errorExplanation: 'container-last-state-error',
errorExists: pod => {
return (pod.status?.containerStatuses ?? []).some(
cs => (cs.lastState?.terminated?.reason ?? '') === 'Error',
);
},
messageAccessor: pod => {
detectErrors: pod => {
return (pod.status?.containerStatuses ?? [])
.filter(cs => (cs.lastState?.terminated?.reason ?? '') === 'Error')
.map(
cs =>
`container=${cs.name} exited with error code (${cs.lastState?.terminated?.exitCode})`,
);
.filter(cs => cs.restartCount > 0)
.map(cs => ({
type: 'containers-restarting',
message: `container=${cs.name} restarted ${cs.restartCount} times`,
severity: 4,
proposedFix: [], // TODO next PR
sourceRef: {
name: pod.metadata?.name ?? 'unknown pod',
namespace: pod.metadata?.namespace ?? 'unknown namespace',
kind: 'Pod',
apiGroup: 'v1',
},
occuranceCount: cs.restartCount,
}));
},
},
];
export const detectErrorsInPods = (
pods: V1Pod[],
clusterName: string,
): DetectedError[] =>
detectErrorsInObjects(pods, 'Pod', clusterName, podErrorMappers);
/**
 * Runs every pod error mapper over the given pods and returns all errors
 * they detect.
 */
export const detectErrorsInPods = (pods: Pod[]): DetectedError[] => {
  return detectErrorsInObjects(pods, podErrorMappers);
};
+38 -29
View File
@@ -14,13 +14,6 @@
* limitations under the License.
*/
// Higher is more sever, but it's relative
import {
V1Deployment,
V1HorizontalPodAutoscaler,
V1Pod,
} from '@kubernetes/client-node';
/**
* Severity of the error, where 10 is critical and 0 is very low.
*
@@ -28,18 +21,6 @@ import {
*/
export type ErrorSeverity = 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10;
export type ErrorDetectable = V1Pod | V1Deployment | V1HorizontalPodAutoscaler;
/**
* Kubernetes kinds that errors might be reported by the plugin
*
* @public
*/
export type ErrorDetectableKind =
| 'Pod'
| 'Deployment'
| 'HorizontalPodAutoscaler';
/**
* A list of errors keyed by Cluster name
*
@@ -47,23 +28,51 @@ export type ErrorDetectableKind =
*/
export type DetectedErrorsByCluster = Map<string, DetectedError[]>;
/**
 * Identifies the Kubernetes resource an error was detected on.
 */
export interface ResourceRef {
// Name of the resource (metadata.name); the detectors fall back to an
// "unknown ..." placeholder when it is missing.
name: string;
// Namespace of the resource (metadata.namespace), or a placeholder.
namespace: string;
// Resource kind, e.g. 'Pod', 'Deployment' or 'HorizontalPodAutoscaler'.
kind: string;
// NOTE(review): the detectors populate this with values such as 'v1',
// 'apps/v1' and 'autoscaling/v1', i.e. group/version strings; confirm
// whether consumers expect a bare API group here.
apiGroup: string;
}
/**
* Represents an error found on a Kubernetes object
*
* @public
*/
export interface DetectedError {
type: string;
severity: ErrorSeverity;
cluster: string;
namespace: string;
kind: ErrorDetectableKind;
names: string[];
message: string[];
message: string;
proposedFix: ProposedFix[];
sourceRef: ResourceRef;
occuranceCount: number;
}
export interface ErrorMapper<T extends ErrorDetectable> {
severity: ErrorSeverity;
errorExplanation: string;
errorExists: (object: T) => boolean;
messageAccessor: (object: T) => string[];
// A suggested way of resolving a detected error. The variants are told
// apart by their literal `type` field ('logs' | 'docs' | 'events').
// NOTE(review): this alias is not exported although the exported
// DetectedError references it; confirm whether it should be.
type ProposedFix = LogSolution | DocsSolution | EventsSolution;
// Fields shared by every proposed fix variant.
interface ProposedFixBase {
// presumably corresponds to DetectedError.type (TODO confirm)
errorType: string;
// Explanation of the underlying cause of the error.
rootCauseExplanation: string;
// Suggestions for fixing the error.
possibleFixes: string[];
}
// Proposed fix variant tagged 'logs'; carries a container name.
export interface LogSolution extends ProposedFixBase {
type: 'logs';
// presumably the container whose logs should be inspected (TODO confirm)
container: string;
}
// Proposed fix variant tagged 'docs'; carries a documentation link.
export interface DocsSolution extends ProposedFixBase {
type: 'docs';
docsLink: string;
}
// Proposed fix variant tagged 'events'; carries a documentation link and a
// pod name.
export interface EventsSolution extends ProposedFixBase {
type: 'events';
docsLink: string;
// presumably the pod whose events should be inspected (TODO confirm)
podName: string;
}
/**
 * Detects errors on a single Kubernetes resource of type T.
 */
export interface ErrorMapper<T> {
// Returns every error found on the resource; an empty array means none.
detectErrors: (resource: T) => DetectedError[];
}
+47 -1
View File
@@ -7173,6 +7173,7 @@ __metadata:
cronstrue: ^2.2.0
cross-fetch: ^3.1.5
js-yaml: ^4.0.0
kubernetes-models: ^4.1.0
lodash: ^4.17.21
luxon: ^3.0.0
msw: ^1.0.0
@@ -11926,6 +11927,39 @@ __metadata:
languageName: node
linkType: hard
"@kubernetes-models/apimachinery@npm:^1.1.0":
version: 1.1.0
resolution: "@kubernetes-models/apimachinery@npm:1.1.0"
dependencies:
"@kubernetes-models/base": ^4.0.0
"@kubernetes-models/validate": ^3.0.0
tslib: ^2.4.0
checksum: 71fc127a8e50c3686e32d9f9df94a307ae4d69995d6e82ea50a7eb9a5ce6a18e9950700ad0eb445e1451c997aabb22e2594ab1037c11faa9c0051b51bab569c6
languageName: node
linkType: hard
"@kubernetes-models/base@npm:^4.0.0":
version: 4.0.0
resolution: "@kubernetes-models/base@npm:4.0.0"
dependencies:
"@kubernetes-models/validate": ^3.0.0
is-plain-object: ^5.0.0
tslib: ^2.4.0
checksum: 3c44f2220e119c24b874d6d674f38f9796f0ab682ba0cff1516d39549a6536055348bcee6c3c5257ebb8d4d6c8d1865d84e519d8f8948e6dd29eb1d72f3830cb
languageName: node
linkType: hard
"@kubernetes-models/validate@npm:^3.0.0":
version: 3.0.0
resolution: "@kubernetes-models/validate@npm:3.0.0"
dependencies:
ajv: ^8.11.0
ajv-formats: ^2.1.1
tslib: ^2.4.0
checksum: b53b7181ddaad7e8faf6ceac228cb23df239ef5da3784887f3378dd3a7f89e1936b7b1884ab123255a905627311bab4302a06668fba35d26152e69fbf208a855
languageName: node
linkType: hard
"@kubernetes/client-node@npm:0.18.1":
version: 0.18.1
resolution: "@kubernetes/client-node@npm:0.18.1"
@@ -17602,7 +17636,7 @@ __metadata:
languageName: node
linkType: hard
"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0":
"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.11.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0":
version: 8.12.0
resolution: "ajv@npm:8.12.0"
dependencies:
@@ -28790,6 +28824,18 @@ __metadata:
languageName: node
linkType: hard
"kubernetes-models@npm:^4.1.0":
version: 4.1.0
resolution: "kubernetes-models@npm:4.1.0"
dependencies:
"@kubernetes-models/apimachinery": ^1.1.0
"@kubernetes-models/base": ^4.0.0
"@kubernetes-models/validate": ^3.0.0
tslib: ^2.4.0
checksum: 1c3195a78ca422a6436f9c4143288a4d29f53b75ff597d72f44cc7f274714daab40a5bb99dd13e02edf3ce11a41f1df34d3d0bb1930b6401cc21c44666ad2b6b
languageName: node
linkType: hard
"kuler@npm:^2.0.0":
version: 2.0.0
resolution: "kuler@npm:2.0.0"