refactor: kubernetes error detection (#17135)
* feat: refactor kubernetes error detection to make way for proposed solutions Signed-off-by: Matthew Clarke <mclarke@spotify.com> * docs: api-docs Signed-off-by: Matthew Clarke <mclarke@spotify.com> * chore: changeset Signed-off-by: Matthew Clarke <mclarke@spotify.com> * refactor: use luxon Signed-off-by: Matthew Clarke <mclarke@spotify.com> --------- Signed-off-by: Matthew Clarke <mclarke@spotify.com>
This commit is contained in:
@@ -0,0 +1,7 @@
|
||||
---
|
||||
'@backstage/plugin-kubernetes': minor
|
||||
---
|
||||
|
||||
refactor kubernetes error detection to make way for proposed solutions
|
||||
|
||||
**BREAKING**: `DetectedError` now appears once per Kubernetes resource per error instead of once for all resources which have that error. The `namespace` and `name` fields are now in the `sourceRef` object, and `message` is now a `string` instead of a `string[]`. `ErrorDetectableKind` has been removed.
|
||||
@@ -104,17 +104,21 @@ export interface DeploymentResources {
|
||||
// @public
|
||||
export interface DetectedError {
|
||||
// (undocumented)
|
||||
cluster: string;
|
||||
message: string;
|
||||
// (undocumented)
|
||||
kind: ErrorDetectableKind;
|
||||
occuranceCount: number;
|
||||
// Warning: (ae-forgotten-export) The symbol "ProposedFix" needs to be exported by the entry point index.d.ts
|
||||
//
|
||||
// (undocumented)
|
||||
message: string[];
|
||||
// (undocumented)
|
||||
names: string[];
|
||||
// (undocumented)
|
||||
namespace: string;
|
||||
proposedFix: ProposedFix[];
|
||||
// (undocumented)
|
||||
severity: ErrorSeverity;
|
||||
// Warning: (ae-forgotten-export) The symbol "ResourceRef" needs to be exported by the entry point index.d.ts
|
||||
//
|
||||
// (undocumented)
|
||||
sourceRef: ResourceRef;
|
||||
// (undocumented)
|
||||
type: string;
|
||||
}
|
||||
|
||||
// @public
|
||||
@@ -137,12 +141,6 @@ export type EntityKubernetesContentProps = {
|
||||
refreshIntervalMs?: number;
|
||||
};
|
||||
|
||||
// @public
|
||||
export type ErrorDetectableKind =
|
||||
| 'Pod'
|
||||
| 'Deployment'
|
||||
| 'HorizontalPodAutoscaler';
|
||||
|
||||
// Warning: (ae-forgotten-export) The symbol "ErrorPanelProps" needs to be exported by the entry point index.d.ts
|
||||
// Warning: (ae-missing-release-tag) "ErrorPanel" is part of the package's API, but it is missing a release tag (@alpha, @beta, @public, or @internal)
|
||||
//
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
"@types/react": "^16.13.1 || ^17.0.0",
|
||||
"cronstrue": "^2.2.0",
|
||||
"js-yaml": "^4.0.0",
|
||||
"kubernetes-models": "^4.1.0",
|
||||
"lodash": "^4.17.21",
|
||||
"luxon": "^3.0.0",
|
||||
"react-use": "^17.2.4"
|
||||
|
||||
@@ -15,84 +15,64 @@
|
||||
*/
|
||||
import * as React from 'react';
|
||||
import { DetectedError, DetectedErrorsByCluster } from '../../error-detection';
|
||||
import { Chip } from '@material-ui/core';
|
||||
import { Table, TableColumn } from '@backstage/core-components';
|
||||
|
||||
type ErrorReportingProps = {
|
||||
detectedErrors: DetectedErrorsByCluster;
|
||||
};
|
||||
|
||||
const columns: TableColumn<DetectedError>[] = [
|
||||
const columns: TableColumn<Row>[] = [
|
||||
{
|
||||
title: 'cluster',
|
||||
width: '10%',
|
||||
render: (detectedError: DetectedError) => detectedError.cluster,
|
||||
render: (row: Row) => row.clusterName,
|
||||
},
|
||||
{
|
||||
title: 'namespace',
|
||||
width: '10%',
|
||||
render: (detectedError: DetectedError) => detectedError.namespace,
|
||||
render: (row: Row) => row.error.sourceRef.namespace,
|
||||
},
|
||||
{
|
||||
title: 'kind',
|
||||
width: '10%',
|
||||
render: (detectedError: DetectedError) => detectedError.kind,
|
||||
render: (row: Row) => row.error.sourceRef.kind,
|
||||
},
|
||||
{
|
||||
title: 'name',
|
||||
width: '30%',
|
||||
render: (detectedError: DetectedError) => {
|
||||
const errorCount = detectedError.names.length;
|
||||
|
||||
if (errorCount === 0) {
|
||||
// This shouldn't happen
|
||||
return null;
|
||||
}
|
||||
|
||||
const displayName = detectedError.names[0];
|
||||
|
||||
const otherErrorCount = errorCount - 1;
|
||||
|
||||
return (
|
||||
<>
|
||||
{displayName}{' '}
|
||||
{otherErrorCount > 0 && (
|
||||
<Chip
|
||||
label={`+ ${otherErrorCount} other${
|
||||
otherErrorCount > 1 ? 's' : ''
|
||||
}`}
|
||||
size="small"
|
||||
/>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
render: (row: Row) => {
|
||||
return <>{row.error.sourceRef.name} </>;
|
||||
},
|
||||
},
|
||||
{
|
||||
title: 'messages',
|
||||
width: '40%',
|
||||
render: (detectedError: DetectedError) => (
|
||||
<>
|
||||
{detectedError.message.map((m, i) => (
|
||||
<div key={i}>{m}</div>
|
||||
))}
|
||||
</>
|
||||
),
|
||||
render: (row: Row) => row.error.message,
|
||||
},
|
||||
];
|
||||
|
||||
const sortBySeverity = (a: DetectedError, b: DetectedError) => {
|
||||
if (a.severity < b.severity) {
|
||||
interface Row {
|
||||
clusterName: string;
|
||||
error: DetectedError;
|
||||
}
|
||||
|
||||
const sortBySeverity = (a: Row, b: Row) => {
|
||||
if (a.error.severity < b.error.severity) {
|
||||
return 1;
|
||||
} else if (b.severity < a.severity) {
|
||||
} else if (b.error.severity < a.error.severity) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
||||
export const ErrorReporting = ({ detectedErrors }: ErrorReportingProps) => {
|
||||
const errors = Array.from(detectedErrors.values())
|
||||
.flat()
|
||||
const errors = Array.from(detectedErrors.entries())
|
||||
.flatMap(([clusterName, resourceErrors]) => {
|
||||
return resourceErrors.map(e => ({
|
||||
clusterName,
|
||||
error: e,
|
||||
}));
|
||||
})
|
||||
.sort(sortBySeverity);
|
||||
|
||||
return (
|
||||
|
||||
@@ -114,9 +114,8 @@ export const KubernetesContent = ({
|
||||
const podsWithErrors = new Set<string>(
|
||||
detectedErrors
|
||||
.get(item.cluster.name)
|
||||
?.filter(de => de.kind === 'Pod')
|
||||
.map(de => de.names)
|
||||
.flat() ?? [],
|
||||
?.filter(de => de.sourceRef.kind === 'Pod')
|
||||
.map(de => de.sourceRef.name),
|
||||
);
|
||||
|
||||
return (
|
||||
|
||||
@@ -185,7 +185,7 @@
|
||||
},
|
||||
"name": "other-side-car",
|
||||
"ready": false,
|
||||
"restartCount": 38,
|
||||
"restartCount": 123,
|
||||
"started": false,
|
||||
"state": {
|
||||
"waiting": {
|
||||
|
||||
@@ -14,59 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import {
|
||||
DetectedError,
|
||||
ErrorDetectable,
|
||||
ErrorDetectableKind,
|
||||
ErrorMapper,
|
||||
} from './types';
|
||||
import { DetectedError, ErrorMapper } from './types';
|
||||
|
||||
// Run through each error mapper for each object
|
||||
// returning a deduplicated (mostly) result
|
||||
export const detectErrorsInObjects = <T extends ErrorDetectable>(
|
||||
export const detectErrorsInObjects = <T>(
|
||||
objects: T[],
|
||||
kind: ErrorDetectableKind,
|
||||
clusterName: string,
|
||||
errorMappers: ErrorMapper<T>[],
|
||||
): DetectedError[] => {
|
||||
// Build up a map of errors
|
||||
// key: the joined message produced by an error
|
||||
// value: the error
|
||||
const errors = new Map<string, DetectedError>();
|
||||
|
||||
for (const object of objects) {
|
||||
for (const errorMapper of errorMappers) {
|
||||
if (errorMapper.errorExists(object)) {
|
||||
const message = errorMapper.messageAccessor(object);
|
||||
|
||||
// TODO This is not perfect as errors with uuid/hashes/date/times will not be caught by this
|
||||
const dedupKey = message.join('');
|
||||
|
||||
const value = errors.get(dedupKey);
|
||||
|
||||
const name = object.metadata?.name ?? 'unknown';
|
||||
const namespace = object.metadata?.namespace ?? 'unknown';
|
||||
|
||||
if (value !== undefined) {
|
||||
// This gets translated into the Chip "+5 others"
|
||||
// in the ErrorReporting component
|
||||
// but we need to keep the names so we can easily
|
||||
// find which objects owns the error later
|
||||
value.names.push(name);
|
||||
errors.set(dedupKey, value);
|
||||
} else {
|
||||
errors.set(dedupKey, {
|
||||
cluster: clusterName,
|
||||
kind: kind,
|
||||
names: [name],
|
||||
message: message,
|
||||
severity: errorMapper.severity,
|
||||
namespace,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Array.from(errors.values());
|
||||
return objects.flatMap(o => {
|
||||
return errorMappers.flatMap(em => em.detectErrors(o));
|
||||
});
|
||||
};
|
||||
|
||||
@@ -15,35 +15,33 @@
|
||||
*/
|
||||
|
||||
import { DetectedError, ErrorMapper } from './types';
|
||||
import { V1Deployment } from '@kubernetes/client-node';
|
||||
import { Deployment } from 'kubernetes-models/apps/v1';
|
||||
import { detectErrorsInObjects } from './common';
|
||||
|
||||
/**
 * Error mappers applied to every Deployment.
 *
 * Each mapper inspects one Deployment and returns zero or more detected
 * errors for it.
 */
const deploymentErrorMappers: ErrorMapper<Deployment>[] = [
  {
    // this is probably important
    // Emits one error per status condition whose status is 'False' and
    // which carries a message.
    detectErrors: deployment => {
      return (deployment.status?.conditions ?? [])
        .filter(c => c.status === 'False')
        .filter(c => c.message !== undefined)
        .map(c => ({
          type: 'condition-message-present',
          message: c.message ?? '',
          severity: 6,
          proposedFix: [], // TODO next PR
          sourceRef: {
            // Fallback names this resource kind; it previously read
            // 'unknown hpa', copied from the HPA mapper.
            name: deployment.metadata?.name ?? 'unknown deployment',
            namespace: deployment.metadata?.namespace ?? 'unknown namespace',
            kind: 'Deployment',
            apiGroup: 'apps/v1',
          },
          occuranceCount: 1,
        }));
    },
  },
];
|
||||
|
||||
export const detectErrorsInDeployments = (
|
||||
deployments: V1Deployment[],
|
||||
clusterName: string,
|
||||
deployments: Deployment[],
|
||||
): DetectedError[] =>
|
||||
detectErrorsInObjects(
|
||||
deployments,
|
||||
'Deployment',
|
||||
clusterName,
|
||||
deploymentErrorMappers,
|
||||
);
|
||||
detectErrorsInObjects(deployments, deploymentErrorMappers);
|
||||
|
||||
@@ -147,51 +147,61 @@ describe('detectErrors', () => {
|
||||
const [err1, err2, err3, err4] = errors ?? [];
|
||||
|
||||
expect(err1).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: [
|
||||
'container=other-side-car restarted 38 times',
|
||||
'container=side-car restarted 38 times',
|
||||
],
|
||||
names: ['dice-roller-canary-7d64cd756c-55rfq'],
|
||||
namespace: 'default',
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'dice-roller-canary-7d64cd756c-55rfq',
|
||||
namespace: 'default',
|
||||
},
|
||||
message:
|
||||
'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
|
||||
severity: 4,
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
type: 'container-waiting',
|
||||
});
|
||||
|
||||
expect(err2).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: [
|
||||
'containers with unready status: [side-car other-side-car]',
|
||||
'containers with unready status: [side-car other-side-car]',
|
||||
],
|
||||
names: ['dice-roller-canary-7d64cd756c-55rfq'],
|
||||
namespace: 'default',
|
||||
severity: 5,
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'dice-roller-canary-7d64cd756c-55rfq',
|
||||
namespace: 'default',
|
||||
},
|
||||
message:
|
||||
'back-off 5m0s restarting failed container=side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
|
||||
severity: 4,
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
type: 'container-waiting',
|
||||
});
|
||||
|
||||
expect(err3).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: [
|
||||
'back-off 5m0s restarting failed container=other-side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
|
||||
'back-off 5m0s restarting failed container=side-car pod=dice-roller-canary-7d64cd756c-55rfq_default(65ad28e3-5d51-4b4b-9bf8-4cb069803034)',
|
||||
],
|
||||
names: ['dice-roller-canary-7d64cd756c-55rfq'],
|
||||
namespace: 'default',
|
||||
severity: 6,
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'dice-roller-canary-7d64cd756c-55rfq',
|
||||
namespace: 'default',
|
||||
},
|
||||
message: 'container=other-side-car restarted 123 times',
|
||||
severity: 4,
|
||||
occuranceCount: 123,
|
||||
proposedFix: [],
|
||||
type: 'containers-restarting',
|
||||
});
|
||||
|
||||
expect(err4).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: [
|
||||
'container=other-side-car exited with error code (1)',
|
||||
'container=side-car exited with error code (1)',
|
||||
],
|
||||
names: ['dice-roller-canary-7d64cd756c-55rfq'],
|
||||
namespace: 'default',
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'dice-roller-canary-7d64cd756c-55rfq',
|
||||
namespace: 'default',
|
||||
},
|
||||
message: 'container=side-car restarted 38 times',
|
||||
severity: 4,
|
||||
occuranceCount: 38,
|
||||
proposedFix: [],
|
||||
type: 'containers-restarting',
|
||||
});
|
||||
});
|
||||
it('should detect errors in pod with missing Config Map', () => {
|
||||
@@ -202,29 +212,22 @@ describe('detectErrors', () => {
|
||||
const errors = result.get(CLUSTER_NAME);
|
||||
|
||||
expect(errors).toBeDefined();
|
||||
expect(errors).toHaveLength(2);
|
||||
expect(errors).toHaveLength(1);
|
||||
|
||||
const [err1, err2] = errors ?? [];
|
||||
const [err1] = errors ?? [];
|
||||
|
||||
expect(err1).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: [
|
||||
'containers with unready status: [nginx]',
|
||||
'containers with unready status: [nginx]',
|
||||
],
|
||||
names: ['dice-roller-bad-cm-855bf85464-mg6xb'],
|
||||
namespace: 'default',
|
||||
severity: 5,
|
||||
});
|
||||
|
||||
expect(err2).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Pod',
|
||||
message: ['configmap "some-cm" not found'],
|
||||
names: ['dice-roller-bad-cm-855bf85464-mg6xb'],
|
||||
namespace: 'default',
|
||||
severity: 6,
|
||||
message: 'configmap "some-cm" not found',
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
severity: 4,
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'dice-roller-bad-cm-855bf85464-mg6xb',
|
||||
namespace: 'default',
|
||||
},
|
||||
type: 'container-waiting',
|
||||
});
|
||||
});
|
||||
it('should detect no errors in healthy deployment', () => {
|
||||
@@ -250,12 +253,17 @@ describe('detectErrors', () => {
|
||||
const [err1] = errors ?? [];
|
||||
|
||||
expect(err1).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'Deployment',
|
||||
message: ['Deployment does not have minimum availability.'],
|
||||
names: ['dice-roller-canary'],
|
||||
namespace: 'default',
|
||||
sourceRef: {
|
||||
apiGroup: 'apps/v1',
|
||||
kind: 'Deployment',
|
||||
name: 'dice-roller-canary',
|
||||
namespace: 'default',
|
||||
},
|
||||
message: 'Deployment does not have minimum availability.',
|
||||
severity: 6,
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
type: 'condition-message-present',
|
||||
});
|
||||
});
|
||||
it('should detect no errors in healthy hpa', () => {
|
||||
@@ -281,14 +289,159 @@ describe('detectErrors', () => {
|
||||
const [err1] = errors ?? [];
|
||||
|
||||
expect(err1).toStrictEqual({
|
||||
cluster: 'cluster-a',
|
||||
kind: 'HorizontalPodAutoscaler',
|
||||
message: [
|
||||
sourceRef: {
|
||||
apiGroup: 'autoscaling/v1',
|
||||
kind: 'HorizontalPodAutoscaler',
|
||||
name: 'dice-roller',
|
||||
namespace: 'default',
|
||||
},
|
||||
message:
|
||||
'Current number of replicas (10) is equal to the configured max number of replicas (10)',
|
||||
],
|
||||
names: ['dice-roller'],
|
||||
namespace: 'default',
|
||||
severity: 8,
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
type: 'hpa-max-current-replicas',
|
||||
});
|
||||
});
|
||||
it('pending pod is not an error', async () => {
|
||||
const expiredReadiness = new Date();
|
||||
expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1);
|
||||
const result = await detectErrors(
|
||||
onePod({
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
name: 'some-container',
|
||||
readinessProbe: {
|
||||
initialDelaySeconds: 20000,
|
||||
failureThreshold: 5,
|
||||
periodSeconds: 5,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
status: {
|
||||
containerStatuses: [
|
||||
{
|
||||
name: 'some-container',
|
||||
image: 'some-image',
|
||||
imageID: 'some-image-id',
|
||||
restartCount: 0,
|
||||
containerID: 'running-container',
|
||||
ready: false,
|
||||
state: {
|
||||
running: {
|
||||
startedAt: new Date().toISOString() as any,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
message: 'Container running',
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const errors = result.get(CLUSTER_NAME);
|
||||
|
||||
expect(errors).toBeDefined();
|
||||
expect(errors).toHaveLength(0);
|
||||
});
|
||||
it('no probe pod has no errors', async () => {
|
||||
const expiredReadiness = new Date();
|
||||
expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1);
|
||||
const result = await detectErrors(
|
||||
onePod({
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
name: 'some-container',
|
||||
},
|
||||
],
|
||||
},
|
||||
status: {
|
||||
containerStatuses: [
|
||||
{
|
||||
name: 'some-container',
|
||||
image: 'some-image',
|
||||
imageID: 'some-image-id',
|
||||
restartCount: 0,
|
||||
containerID: 'running-container',
|
||||
ready: false,
|
||||
state: {
|
||||
running: {
|
||||
startedAt: new Date().toISOString() as any,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
message: 'Container running',
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const errors = result.get(CLUSTER_NAME);
|
||||
|
||||
expect(errors).toBeDefined();
|
||||
expect(errors).toHaveLength(0);
|
||||
});
|
||||
it('readiness probe failure results in error', async () => {
|
||||
const expiredReadiness = new Date();
|
||||
expiredReadiness.setFullYear(expiredReadiness.getFullYear() - 1);
|
||||
const result = await detectErrors(
|
||||
onePod({
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
name: 'some-container',
|
||||
readinessProbe: {
|
||||
initialDelaySeconds: 20,
|
||||
failureThreshold: 5,
|
||||
periodSeconds: 5,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
status: {
|
||||
containerStatuses: [
|
||||
{
|
||||
name: 'some-container',
|
||||
image: 'some-image',
|
||||
imageID: 'some-image-id',
|
||||
restartCount: 0,
|
||||
containerID: 'running-container',
|
||||
ready: false,
|
||||
state: {
|
||||
running: {
|
||||
startedAt: expiredReadiness.toISOString() as any,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
message: 'Container running',
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const errors = result.get(CLUSTER_NAME);
|
||||
|
||||
expect(errors).toBeDefined();
|
||||
expect(errors).toHaveLength(1);
|
||||
|
||||
const [err1] = errors ?? [];
|
||||
|
||||
expect(err1).toStrictEqual({
|
||||
message:
|
||||
'The container some-container failed to start properly, but is not crashing',
|
||||
occuranceCount: 1,
|
||||
proposedFix: [],
|
||||
severity: 4,
|
||||
sourceRef: {
|
||||
apiGroup: 'v1',
|
||||
kind: 'Pod',
|
||||
name: 'unknown pod',
|
||||
namespace: 'unknown namespace',
|
||||
},
|
||||
type: 'readiness-probe-taking-too-long',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -20,6 +20,9 @@ import { groupResponses } from '../utils/response';
|
||||
import { detectErrorsInPods } from './pods';
|
||||
import { detectErrorsInDeployments } from './deployments';
|
||||
import { detectErrorsInHpa } from './hpas';
|
||||
import { Deployment } from 'kubernetes-models/apps/v1';
|
||||
import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1';
|
||||
import { Pod } from 'kubernetes-models/v1';
|
||||
|
||||
/**
|
||||
* For each cluster try to find errors in each of the object types provided
|
||||
@@ -38,20 +41,16 @@ export const detectErrors = (
|
||||
const groupedResponses = groupResponses(clusterResponse.resources);
|
||||
|
||||
clusterErrors = clusterErrors.concat(
|
||||
detectErrorsInPods(groupedResponses.pods, clusterResponse.cluster.name),
|
||||
detectErrorsInPods(groupedResponses.pods as Pod[]),
|
||||
);
|
||||
|
||||
clusterErrors = clusterErrors.concat(
|
||||
detectErrorsInDeployments(
|
||||
groupedResponses.deployments,
|
||||
clusterResponse.cluster.name,
|
||||
),
|
||||
detectErrorsInDeployments(groupedResponses.deployments as Deployment[]),
|
||||
);
|
||||
|
||||
clusterErrors = clusterErrors.concat(
|
||||
detectErrorsInHpa(
|
||||
groupedResponses.horizontalPodAutoscalers,
|
||||
clusterResponse.cluster.name,
|
||||
groupedResponses.horizontalPodAutoscalers as HorizontalPodAutoscaler[],
|
||||
),
|
||||
);
|
||||
|
||||
|
||||
@@ -14,37 +14,39 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import { V1HorizontalPodAutoscaler } from '@kubernetes/client-node';
|
||||
import { HorizontalPodAutoscaler } from 'kubernetes-models/autoscaling/v1';
|
||||
import { DetectedError, ErrorMapper } from './types';
|
||||
import { detectErrorsInObjects } from './common';
|
||||
|
||||
const hpaErrorMappers: ErrorMapper<V1HorizontalPodAutoscaler>[] = [
|
||||
const hpaErrorMappers: ErrorMapper<HorizontalPodAutoscaler>[] = [
|
||||
{
|
||||
// this is probably important
|
||||
severity: 8,
|
||||
errorExplanation: 'hpa-max-current-replicas',
|
||||
errorExists: hpa => {
|
||||
return (hpa.spec?.maxReplicas ?? -1) === hpa.status?.currentReplicas;
|
||||
},
|
||||
messageAccessor: hpa => {
|
||||
return [
|
||||
`Current number of replicas (${
|
||||
hpa.status?.currentReplicas
|
||||
}) is equal to the configured max number of replicas (${
|
||||
hpa.spec?.maxReplicas ?? -1
|
||||
})`,
|
||||
];
|
||||
detectErrors: hpa => {
|
||||
if ((hpa.spec?.maxReplicas ?? -1) === hpa.status?.currentReplicas) {
|
||||
return [
|
||||
{
|
||||
type: 'hpa-max-current-replicas',
|
||||
message: `Current number of replicas (${
|
||||
hpa.status?.currentReplicas
|
||||
}) is equal to the configured max number of replicas (${
|
||||
hpa.spec?.maxReplicas ?? -1
|
||||
})`,
|
||||
severity: 8,
|
||||
proposedFix: [], // TODO next PR
|
||||
sourceRef: {
|
||||
name: hpa.metadata?.name ?? 'unknown hpa',
|
||||
namespace: hpa.metadata?.namespace ?? 'unknown namespace',
|
||||
kind: 'HorizontalPodAutoscaler',
|
||||
apiGroup: 'autoscaling/v1',
|
||||
},
|
||||
occuranceCount: 1,
|
||||
},
|
||||
];
|
||||
}
|
||||
return [];
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
export const detectErrorsInHpa = (
|
||||
hpas: V1HorizontalPodAutoscaler[],
|
||||
clusterName: string,
|
||||
): DetectedError[] =>
|
||||
detectErrorsInObjects(
|
||||
hpas,
|
||||
'HorizontalPodAutoscaler',
|
||||
clusterName,
|
||||
hpaErrorMappers,
|
||||
);
|
||||
hpas: HorizontalPodAutoscaler[],
|
||||
): DetectedError[] => detectErrorsInObjects(hpas, hpaErrorMappers);
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
*/
|
||||
|
||||
export type {
|
||||
ErrorDetectableKind,
|
||||
DetectedError,
|
||||
DetectedErrorsByCluster,
|
||||
ErrorSeverity,
|
||||
|
||||
@@ -14,82 +14,121 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import { V1Pod } from '@kubernetes/client-node';
|
||||
import { totalRestarts } from '../utils/pod';
|
||||
import { Pod, IContainerStatus, IContainer } from 'kubernetes-models/v1';
|
||||
import { DetectedError, ErrorMapper } from './types';
|
||||
import { detectErrorsInObjects } from './common';
|
||||
import lodash from 'lodash';
|
||||
import { DateTime } from 'luxon';
|
||||
|
||||
const podErrorMappers: ErrorMapper<V1Pod>[] = [
|
||||
/**
 * Decides whether a container has used up the time its readiness probe
 * allows and is still not ready.
 *
 * Returns `false` when the container is already ready, when its status has
 * no running start time, or when its spec declares no readiness probe.
 * Otherwise the deadline is the start time plus `initialDelaySeconds` plus
 * `periodSeconds * failureThreshold` (each treated as 0 when absent), and
 * the result is `true` once that deadline is earlier than the current time.
 */
function isPodReadinessProbeUnready({
  container,
  containerStatus,
}: ContainerSpecAndStatus): boolean {
  const probe = container.readinessProbe;
  const startedAt = containerStatus.state?.running?.startedAt;

  if (containerStatus.ready || startedAt === undefined || !probe) {
    return false;
  }

  // Grace period before the first probe runs.
  const initialDelaySeconds = probe.initialDelaySeconds ?? 0;
  // Time taken by the configured number of consecutive failing probes.
  const failureWindowSeconds =
    (probe.periodSeconds ?? 0) * (probe.failureThreshold ?? 0);

  const deadline = DateTime.fromISO(startedAt)
    .plus({ seconds: initialDelaySeconds })
    .plus({ seconds: failureWindowSeconds });

  return deadline < DateTime.now();
}
|
||||
|
||||
/**
 * Pairs a container's spec entry with the runtime status reported for it.
 */
interface ContainerSpecAndStatus {
  /** The container definition taken from `pod.spec.containers`. */
  container: IContainer;
  /** The same-named entry taken from `pod.status.containerStatuses`. */
  containerStatus: IContainerStatus;
}

/**
 * Joins each entry of `pod.status.containerStatuses` with the container of
 * the same name from `pod.spec.containers`.
 *
 * Statuses that have no same-named spec container are skipped, which
 * includes every status when the pod has no spec. When several spec
 * containers share a name, the first one is used.
 *
 * @param pod - the pod whose containers should be paired up
 * @returns one pair per container status that has a matching spec container,
 *   in the order of `containerStatuses`
 */
const podToContainerSpecsAndStatuses = (pod: Pod): ContainerSpecAndStatus[] => {
  const specs = lodash.groupBy(pod.spec?.containers ?? [], value => value.name);

  const result: ContainerSpecAndStatus[] = [];

  for (const cs of pod.status?.containerStatuses ?? []) {
    // The grouped lookup has no entry for a name that is absent from the
    // spec, so guard the access instead of reading `.length` on undefined.
    const spec = specs[cs.name]?.[0];
    if (spec !== undefined) {
      result.push({
        container: spec,
        containerStatus: cs,
      });
    }
  }

  return result;
};
|
||||
|
||||
const podErrorMappers: ErrorMapper<Pod>[] = [
|
||||
{
|
||||
severity: 5,
|
||||
errorExplanation: 'status-message',
|
||||
errorExists: pod => {
|
||||
return pod.status?.message !== undefined;
|
||||
},
|
||||
messageAccessor: pod => {
|
||||
return [pod.status?.message ?? ''];
|
||||
detectErrors: pod => {
|
||||
return podToContainerSpecsAndStatuses(pod)
|
||||
.filter(isPodReadinessProbeUnready)
|
||||
.map(cs => ({
|
||||
type: 'readiness-probe-taking-too-long',
|
||||
message: `The container ${cs.container.name} failed to start properly, but is not crashing`,
|
||||
severity: 4,
|
||||
proposedFix: [], // TODO next PR
|
||||
sourceRef: {
|
||||
name: pod.metadata?.name ?? 'unknown pod',
|
||||
namespace: pod.metadata?.namespace ?? 'unknown namespace',
|
||||
kind: 'Pod',
|
||||
apiGroup: 'v1',
|
||||
},
|
||||
occuranceCount: 1,
|
||||
}));
|
||||
},
|
||||
},
|
||||
{
|
||||
severity: 4,
|
||||
errorExplanation: 'containers-restarting',
|
||||
errorExists: pod => {
|
||||
// TODO magic number
|
||||
return totalRestarts(pod) > 3;
|
||||
},
|
||||
messageAccessor: pod => {
|
||||
return (pod.status?.containerStatuses ?? [])
|
||||
.filter(cs => cs.restartCount > 0)
|
||||
.map(cs => `container=${cs.name} restarted ${cs.restartCount} times`);
|
||||
},
|
||||
},
|
||||
{
|
||||
severity: 5,
|
||||
errorExplanation: 'condition-message-present',
|
||||
errorExists: pod => {
|
||||
return (pod.status?.conditions ?? []).some(c => c.message !== undefined);
|
||||
},
|
||||
messageAccessor: pod => {
|
||||
return (pod.status?.conditions ?? [])
|
||||
.filter(c => c.message !== undefined)
|
||||
.map(c => c.message ?? '');
|
||||
},
|
||||
},
|
||||
{
|
||||
severity: 6,
|
||||
errorExplanation: 'container-waiting',
|
||||
errorExists: pod => {
|
||||
return (pod.status?.containerStatuses ?? []).some(
|
||||
cs => cs.state?.waiting?.message !== undefined,
|
||||
);
|
||||
},
|
||||
messageAccessor: pod => {
|
||||
detectErrors: pod => {
|
||||
return (pod.status?.containerStatuses ?? [])
|
||||
.filter(cs => cs.state?.waiting?.message !== undefined)
|
||||
.map(cs => cs.state?.waiting?.message ?? '');
|
||||
.map(cs => ({
|
||||
type: 'container-waiting',
|
||||
message: cs.state?.waiting?.message ?? 'container waiting',
|
||||
severity: 4,
|
||||
proposedFix: [], // TODO next PR
|
||||
sourceRef: {
|
||||
name: pod.metadata?.name ?? 'unknown pod',
|
||||
namespace: pod.metadata?.namespace ?? 'unknown namespace',
|
||||
kind: 'Pod',
|
||||
apiGroup: 'v1',
|
||||
},
|
||||
occuranceCount: 1,
|
||||
}));
|
||||
},
|
||||
},
|
||||
{
|
||||
severity: 4,
|
||||
errorExplanation: 'container-last-state-error',
|
||||
errorExists: pod => {
|
||||
return (pod.status?.containerStatuses ?? []).some(
|
||||
cs => (cs.lastState?.terminated?.reason ?? '') === 'Error',
|
||||
);
|
||||
},
|
||||
messageAccessor: pod => {
|
||||
detectErrors: pod => {
|
||||
return (pod.status?.containerStatuses ?? [])
|
||||
.filter(cs => (cs.lastState?.terminated?.reason ?? '') === 'Error')
|
||||
.map(
|
||||
cs =>
|
||||
`container=${cs.name} exited with error code (${cs.lastState?.terminated?.exitCode})`,
|
||||
);
|
||||
.filter(cs => cs.restartCount > 0)
|
||||
.map(cs => ({
|
||||
type: 'containers-restarting',
|
||||
message: `container=${cs.name} restarted ${cs.restartCount} times`,
|
||||
severity: 4,
|
||||
proposedFix: [], // TODO next PR
|
||||
sourceRef: {
|
||||
name: pod.metadata?.name ?? 'unknown pod',
|
||||
namespace: pod.metadata?.namespace ?? 'unknown namespace',
|
||||
kind: 'Pod',
|
||||
apiGroup: 'v1',
|
||||
},
|
||||
occuranceCount: cs.restartCount,
|
||||
}));
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
export const detectErrorsInPods = (
|
||||
pods: V1Pod[],
|
||||
clusterName: string,
|
||||
): DetectedError[] =>
|
||||
detectErrorsInObjects(pods, 'Pod', clusterName, podErrorMappers);
|
||||
export const detectErrorsInPods = (pods: Pod[]): DetectedError[] =>
|
||||
detectErrorsInObjects(pods, podErrorMappers);
|
||||
|
||||
@@ -14,13 +14,6 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Higher is more severe, but it's relative
|
||||
import {
|
||||
V1Deployment,
|
||||
V1HorizontalPodAutoscaler,
|
||||
V1Pod,
|
||||
} from '@kubernetes/client-node';
|
||||
|
||||
/**
|
||||
* Severity of the error, where 10 is critical and 0 is very low.
|
||||
*
|
||||
@@ -28,18 +21,6 @@ import {
|
||||
*/
|
||||
export type ErrorSeverity = 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10;
|
||||
|
||||
export type ErrorDetectable = V1Pod | V1Deployment | V1HorizontalPodAutoscaler;
|
||||
|
||||
/**
|
||||
* Kubernetes kinds that errors might be reported by the plugin
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
export type ErrorDetectableKind =
|
||||
| 'Pod'
|
||||
| 'Deployment'
|
||||
| 'HorizontalPodAutoscaler';
|
||||
|
||||
/**
|
||||
* A list of errors keyed by Cluster name
|
||||
*
|
||||
@@ -47,23 +28,51 @@ export type ErrorDetectableKind =
|
||||
*/
|
||||
export type DetectedErrorsByCluster = Map<string, DetectedError[]>;
|
||||
|
||||
export interface ResourceRef {
|
||||
name: string;
|
||||
namespace: string;
|
||||
kind: string;
|
||||
apiGroup: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents an error found on a Kubernetes object
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
export interface DetectedError {
|
||||
type: string;
|
||||
severity: ErrorSeverity;
|
||||
cluster: string;
|
||||
namespace: string;
|
||||
kind: ErrorDetectableKind;
|
||||
names: string[];
|
||||
message: string[];
|
||||
message: string;
|
||||
proposedFix: ProposedFix[];
|
||||
sourceRef: ResourceRef;
|
||||
occuranceCount: number;
|
||||
}
|
||||
|
||||
export interface ErrorMapper<T extends ErrorDetectable> {
|
||||
severity: ErrorSeverity;
|
||||
errorExplanation: string;
|
||||
errorExists: (object: T) => boolean;
|
||||
messageAccessor: (object: T) => string[];
|
||||
type ProposedFix = LogSolution | DocsSolution | EventsSolution;
|
||||
|
||||
interface ProposedFixBase {
|
||||
errorType: string;
|
||||
rootCauseExplanation: string;
|
||||
possibleFixes: string[];
|
||||
}
|
||||
|
||||
export interface LogSolution extends ProposedFixBase {
|
||||
type: 'logs';
|
||||
container: string;
|
||||
}
|
||||
|
||||
export interface DocsSolution extends ProposedFixBase {
|
||||
type: 'docs';
|
||||
docsLink: string;
|
||||
}
|
||||
|
||||
export interface EventsSolution extends ProposedFixBase {
|
||||
type: 'events';
|
||||
docsLink: string;
|
||||
podName: string;
|
||||
}
|
||||
|
||||
export interface ErrorMapper<T> {
|
||||
detectErrors: (resource: T) => DetectedError[];
|
||||
}
|
||||
|
||||
@@ -7173,6 +7173,7 @@ __metadata:
|
||||
cronstrue: ^2.2.0
|
||||
cross-fetch: ^3.1.5
|
||||
js-yaml: ^4.0.0
|
||||
kubernetes-models: ^4.1.0
|
||||
lodash: ^4.17.21
|
||||
luxon: ^3.0.0
|
||||
msw: ^1.0.0
|
||||
@@ -11926,6 +11927,39 @@ __metadata:
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"@kubernetes-models/apimachinery@npm:^1.1.0":
|
||||
version: 1.1.0
|
||||
resolution: "@kubernetes-models/apimachinery@npm:1.1.0"
|
||||
dependencies:
|
||||
"@kubernetes-models/base": ^4.0.0
|
||||
"@kubernetes-models/validate": ^3.0.0
|
||||
tslib: ^2.4.0
|
||||
checksum: 71fc127a8e50c3686e32d9f9df94a307ae4d69995d6e82ea50a7eb9a5ce6a18e9950700ad0eb445e1451c997aabb22e2594ab1037c11faa9c0051b51bab569c6
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"@kubernetes-models/base@npm:^4.0.0":
|
||||
version: 4.0.0
|
||||
resolution: "@kubernetes-models/base@npm:4.0.0"
|
||||
dependencies:
|
||||
"@kubernetes-models/validate": ^3.0.0
|
||||
is-plain-object: ^5.0.0
|
||||
tslib: ^2.4.0
|
||||
checksum: 3c44f2220e119c24b874d6d674f38f9796f0ab682ba0cff1516d39549a6536055348bcee6c3c5257ebb8d4d6c8d1865d84e519d8f8948e6dd29eb1d72f3830cb
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"@kubernetes-models/validate@npm:^3.0.0":
|
||||
version: 3.0.0
|
||||
resolution: "@kubernetes-models/validate@npm:3.0.0"
|
||||
dependencies:
|
||||
ajv: ^8.11.0
|
||||
ajv-formats: ^2.1.1
|
||||
tslib: ^2.4.0
|
||||
checksum: b53b7181ddaad7e8faf6ceac228cb23df239ef5da3784887f3378dd3a7f89e1936b7b1884ab123255a905627311bab4302a06668fba35d26152e69fbf208a855
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"@kubernetes/client-node@npm:0.18.1":
|
||||
version: 0.18.1
|
||||
resolution: "@kubernetes/client-node@npm:0.18.1"
|
||||
@@ -17602,7 +17636,7 @@ __metadata:
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0":
|
||||
"ajv@npm:^8.0.0, ajv@npm:^8.10.0, ajv@npm:^8.11.0, ajv@npm:^8.12.0, ajv@npm:^8.8.0":
|
||||
version: 8.12.0
|
||||
resolution: "ajv@npm:8.12.0"
|
||||
dependencies:
|
||||
@@ -28790,6 +28824,18 @@ __metadata:
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"kubernetes-models@npm:^4.1.0":
|
||||
version: 4.1.0
|
||||
resolution: "kubernetes-models@npm:4.1.0"
|
||||
dependencies:
|
||||
"@kubernetes-models/apimachinery": ^1.1.0
|
||||
"@kubernetes-models/base": ^4.0.0
|
||||
"@kubernetes-models/validate": ^3.0.0
|
||||
tslib: ^2.4.0
|
||||
checksum: 1c3195a78ca422a6436f9c4143288a4d29f53b75ff597d72f44cc7f274714daab40a5bb99dd13e02edf3ce11a41f1df34d3d0bb1930b6401cc21c44666ad2b6b
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"kuler@npm:^2.0.0":
|
||||
version: 2.0.0
|
||||
resolution: "kuler@npm:2.0.0"
|
||||
|
||||
Reference in New Issue
Block a user