diff --git a/README.md b/README.md index d9265feb4..38a593583 100644 --- a/README.md +++ b/README.md @@ -635,3 +635,84 @@ Note : Here node-id refers to the node that has the unhealthy etcd member. This ``` osdctl swarm secondary ``` + +### Feature Testing Evidence Collection + +These commands help SRE teams collect evidence during feature validation testing (IAM policies, operators, etc.). + +#### CloudTrail Errors + +Surface permission errors and other AWS API errors from CloudTrail. Useful when validating new IAM policies or features that interact with AWS APIs. + +```bash +# Get permission errors from the last hour +osdctl cloudtrail errors -C --since 1h + +# Get errors from the last 30 minutes with JSON output +osdctl cloudtrail errors -C --since 30m --json + +# Get errors with AWS console links +osdctl cloudtrail errors -C --since 2h --url + +# Filter for specific error types +osdctl cloudtrail errors -C --since 1h --error-types AccessDenied,UnauthorizedOperation +``` + +**Note:** For ROSA HCP clusters, CloudTrail events only show customer account activity. Control plane activity is in Red Hat's account and not visible. + +#### Cluster Snapshot + +Capture a point-in-time snapshot of cluster state for evidence collection. The snapshot includes nodes, ClusterOperators, and namespaces. + +```bash +# Capture cluster snapshot to a file +osdctl cluster snapshot -C -o before.yaml + +# Capture snapshot with specific namespaces +osdctl cluster snapshot -C -o snapshot.yaml --namespaces openshift-monitoring,openshift-operators + +# Capture additional resource types +osdctl cluster snapshot -C -o snapshot.yaml --resources pods,deployments,services +``` + +#### Cluster Diff + +Compare two cluster snapshots to identify changes. Useful for understanding what changed during feature testing. 
+ +```bash +# Compare two snapshots +osdctl cluster diff before.yaml after.yaml + +# Compare snapshots with JSON output +osdctl cluster diff before.yaml after.yaml --json +``` + +Changes are categorized as: +- `+` added: Resource exists in after but not in before +- `-` removed: Resource exists in before but not in after +- `~` modified: Resource exists in both but with different values + +#### Evidence Collection (All-in-One) + +Collect comprehensive evidence from a cluster and AWS for feature testing. This all-in-one command gathers cluster state, CloudTrail events, and optionally Kubernetes events and must-gather output. + +```bash +# Collect all evidence to a directory +osdctl evidence collect -C --output ./evidence/ + +# Collect evidence from the last 2 hours +osdctl evidence collect -C --output ./evidence/ --since 2h + +# Collect evidence without CloudTrail (for non-AWS or limited access) +osdctl evidence collect -C --output ./evidence/ --skip-cloudtrail + +# Include Kubernetes events in collection +osdctl evidence collect -C --output ./evidence/ --include-events + +# Include must-gather output +osdctl evidence collect -C --output ./evidence/ --include-must-gather +``` + +The collected evidence includes: +- `evidence.yaml` - Main evidence file with cluster state and CloudTrail data +- `summary.txt` - Human-readable summary of findings diff --git a/cmd/cloudtrail/cmd.go b/cmd/cloudtrail/cmd.go index 6cf52089f..406f3c729 100644 --- a/cmd/cloudtrail/cmd.go +++ b/cmd/cloudtrail/cmd.go @@ -16,6 +16,7 @@ func NewCloudtrailCmd() *cobra.Command { cloudtrailCmd.AddCommand(newCmdWriteEvents()) cloudtrailCmd.AddCommand(newCmdPermissionDenied()) + cloudtrailCmd.AddCommand(newCmdErrors()) return cloudtrailCmd } diff --git a/cmd/cloudtrail/errors.go b/cmd/cloudtrail/errors.go new file mode 100644 index 000000000..538b27bae --- /dev/null +++ b/cmd/cloudtrail/errors.go @@ -0,0 +1,322 @@ +package cloudtrail + +import ( + "encoding/json" + "fmt" + "regexp" + "strings" + "time" 
+ + "github.com/aws/aws-sdk-go-v2/service/cloudtrail/types" + "github.com/aws/aws-sdk-go-v2/service/sts" + "github.com/openshift/osdctl/pkg/osdCloud" + "github.com/openshift/osdctl/pkg/utils" + "github.com/spf13/cobra" +) + +// Default error patterns to match for IAM/permission issues +var defaultErrorPatterns = []string{ + "AccessDenied", + "UnauthorizedOperation", + "Client.UnauthorizedOperation", + "Forbidden", + "InvalidClientTokenId", + "AuthFailure", + "ExpiredToken", + "SignatureDoesNotMatch", +} + +type errorsOptions struct { + ClusterID string + StartTime string + PrintUrl bool + PrintRaw bool + JSONOutput bool + ErrorTypes []string +} + +type errorEventOutput struct { + EventName string `json:"eventName"` + EventTime string `json:"eventTime"` + ErrorCode string `json:"errorCode"` + UserARN string `json:"userArn,omitempty"` + UserName string `json:"userName,omitempty"` + Region string `json:"region,omitempty"` + ConsoleLink string `json:"consoleLink,omitempty"` +} + +func newCmdErrors() *cobra.Command { + opts := &errorsOptions{} + + errorsCmd := &cobra.Command{ + Use: "errors", + Short: "Prints CloudTrail error events (permission/IAM issues) to console.", + Long: `Surfaces permission and IAM-related errors from AWS CloudTrail. 
+ +By default, matches these error patterns: + - AccessDenied + - UnauthorizedOperation / Client.UnauthorizedOperation + - Forbidden + - InvalidClientTokenId + - AuthFailure + - ExpiredToken + - SignatureDoesNotMatch + +Use --error-types to filter for specific error patterns.`, + Example: ` # Check for permission errors in the last hour + osdctl cloudtrail errors -C --since 1h + + # Check for specific error types only + osdctl cloudtrail errors -C --error-types AccessDenied,Forbidden + + # Output as JSON for scripting + osdctl cloudtrail errors -C --json + + # Include console links for each event + osdctl cloudtrail errors -C --url`, + RunE: func(cmd *cobra.Command, args []string) error { + return opts.run() + }, + } + + errorsCmd.Flags().StringVarP(&opts.ClusterID, "cluster-id", "C", "", "Cluster ID") + errorsCmd.Flags().StringVarP(&opts.StartTime, "since", "", "1h", "Time window to search (e.g., 30m, 1h, 24h). Valid units: ns, us, ms, s, m, h.") + errorsCmd.Flags().BoolVarP(&opts.PrintUrl, "url", "u", false, "Include console URL links for each event") + errorsCmd.Flags().BoolVarP(&opts.PrintRaw, "raw-event", "r", false, "Print raw CloudTrail event JSON") + errorsCmd.Flags().BoolVar(&opts.JSONOutput, "json", false, "Output results as JSON") + errorsCmd.Flags().StringSliceVar(&opts.ErrorTypes, "error-types", nil, "Comma-separated list of error patterns to match (default: all common permission errors)") + _ = errorsCmd.MarkFlagRequired("cluster-id") + + return errorsCmd +} + +func (o *errorsOptions) run() error { + err := utils.IsValidClusterKey(o.ClusterID) + if err != nil { + return err + } + + connection, err := utils.CreateConnection() + if err != nil { + return fmt.Errorf("unable to create connection to OCM: %w", err) + } + defer connection.Close() + + cluster, err := utils.GetClusterAnyStatus(connection, o.ClusterID) + if err != nil { + return err + } + + if strings.ToUpper(cluster.CloudProvider().ID()) != "AWS" { + return fmt.Errorf("this command is only 
available for AWS clusters") + } + + cfg, err := osdCloud.CreateAWSV2Config(connection, cluster) + if err != nil { + return err + } + + startTime, err := parseDurationToUTC(o.StartTime) + if err != nil { + return err + } + + arn, accountID, err := Whoami(*sts.NewFromConfig(cfg)) + if err != nil { + return err + } + + // Build error patterns to match + patterns := defaultErrorPatterns + if len(o.ErrorTypes) > 0 { + patterns = o.ErrorTypes + } + + if !o.JSONOutput { + fmt.Printf("[INFO] Checking error history since %v for AWS Account %v as %v\n", startTime.Format(time.RFC3339), accountID, arn) + fmt.Printf("[INFO] Matching error patterns: %v\n", patterns) + fmt.Printf("[INFO] Fetching CloudTrail error events from %v region...\n", cfg.Region) + } + + awsAPI := NewEventAPI(cfg, false, cfg.Region) + requestTime := Period{StartTime: startTime, EndTime: time.Now().UTC()} + generator := awsAPI.GetEvents(o.ClusterID, requestTime) + + var allEvents []errorEventOutput + eventCount := 0 + + // Process events from cluster region + for page := range generator { + filteredEvents, err := ApplyFilters(page.AWSEvent, + func(event types.Event) (bool, error) { + return o.isErrorEvent(event, patterns) + }, + ) + if err != nil { + return err + } + + if o.JSONOutput { + for _, event := range filteredEvents { + output := o.eventToOutput(event, cfg.Region) + allEvents = append(allEvents, output) + } + } else if o.PrintRaw { + for _, event := range filteredEvents { + if event.CloudTrailEvent != nil { + fmt.Println(*event.CloudTrailEvent) + } + } + } else if len(filteredEvents) > 0 { + o.printEvents(filteredEvents, cfg.Region) + } + eventCount += len(filteredEvents) + } + + // Also check global region if different + if DEFAULT_REGION != cfg.Region { + defaultAwsAPI := NewEventAPI(cfg, true, DEFAULT_REGION) + + if !o.JSONOutput && !o.PrintRaw { + fmt.Printf("[INFO] Fetching CloudTrail error events from %v region...\n", DEFAULT_REGION) + } + + generator := defaultAwsAPI.GetEvents(o.ClusterID, 
requestTime) + + for page := range generator { + filteredEvents, err := ApplyFilters(page.AWSEvent, + func(event types.Event) (bool, error) { + return o.isErrorEvent(event, patterns) + }, + ) + if err != nil { + return err + } + + if o.JSONOutput { + for _, event := range filteredEvents { + output := o.eventToOutput(event, DEFAULT_REGION) + allEvents = append(allEvents, output) + } + } else if o.PrintRaw { + for _, event := range filteredEvents { + if event.CloudTrailEvent != nil { + fmt.Println(*event.CloudTrailEvent) + } + } + } else if len(filteredEvents) > 0 { + o.printEvents(filteredEvents, DEFAULT_REGION) + } + eventCount += len(filteredEvents) + } + } + + if o.JSONOutput { + output, err := json.MarshalIndent(allEvents, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal JSON output: %w", err) + } + fmt.Println(string(output)) + } else { + fmt.Printf("\n[INFO] Found %d error event(s)\n", eventCount) + } + + return nil +} + +func (o *errorsOptions) isErrorEvent(event types.Event, patterns []string) (bool, error) { + raw, err := ExtractUserDetails(event.CloudTrailEvent) + if err != nil { + return false, fmt.Errorf("failed to extract CloudTrail event details: %w", err) + } + + errorCode := raw.ErrorCode + if errorCode == "" { + return false, nil + } + + for _, pattern := range patterns { + check, err := regexp.Compile("(?i)" + regexp.QuoteMeta(pattern)) + if err != nil { + return false, fmt.Errorf("failed to compile regex for pattern %s: %w", pattern, err) + } + if check.MatchString(errorCode) { + return true, nil + } + } + + return false, nil +} + +func (o *errorsOptions) eventToOutput(event types.Event, region string) errorEventOutput { + output := errorEventOutput{ + Region: region, + } + + if event.EventName != nil { + output.EventName = *event.EventName + } + if event.EventTime != nil { + output.EventTime = event.EventTime.Format(time.RFC3339) + } + + raw, err := ExtractUserDetails(event.CloudTrailEvent) + if err == nil { + output.ErrorCode = 
raw.ErrorCode + output.UserARN = raw.UserIdentity.SessionContext.SessionIssuer.Arn + output.UserName = raw.UserIdentity.SessionContext.SessionIssuer.UserName + } + + if o.PrintUrl && event.EventId != nil { + output.ConsoleLink = fmt.Sprintf("https://%s.console.aws.amazon.com/cloudtrailv2/home?region=%s#/events/%s", + region, region, *event.EventId) + } + + return output +} + +func (o *errorsOptions) printEvents(events []types.Event, region string) { + for _, event := range events { + fmt.Println("─────────────────────────────────────────────────────────────") + + if event.EventName != nil { + fmt.Printf("Event: %s\n", *event.EventName) + } + if event.EventTime != nil { + fmt.Printf("Time: %s\n", event.EventTime.Format(time.RFC3339)) + } + + raw, err := ExtractUserDetails(event.CloudTrailEvent) + if err == nil { + if raw.ErrorCode != "" { + fmt.Printf("Error: %s\n", raw.ErrorCode) + } + userName := raw.UserIdentity.SessionContext.SessionIssuer.UserName + if userName != "" { + fmt.Printf("User: %s\n", userName) + } + userArn := raw.UserIdentity.SessionContext.SessionIssuer.Arn + if userArn != "" { + fmt.Printf("ARN: %s\n", userArn) + } + } + + fmt.Printf("Region: %s\n", region) + + if o.PrintUrl && event.EventId != nil { + fmt.Printf("Console: https://%s.console.aws.amazon.com/cloudtrailv2/home?region=%s#/events/%s\n", + region, region, *event.EventId) + } + } +} + +func parseDurationToUTC(since string) (time.Time, error) { + duration, err := time.ParseDuration(since) + if err != nil { + return time.Time{}, fmt.Errorf("invalid duration format %q: %w", since, err) + } + if duration <= 0 { + return time.Time{}, fmt.Errorf("duration must be positive, got %q", since) + } + return time.Now().UTC().Add(-duration), nil +} diff --git a/cmd/cluster/cmd.go b/cmd/cluster/cmd.go index d32765222..e429c693b 100644 --- a/cmd/cluster/cmd.go +++ b/cmd/cluster/cmd.go @@ -50,6 +50,8 @@ func NewCmdCluster(streams genericclioptions.IOStreams, client *k8s.LazyClient, 
clusterCmd.AddCommand(newCmdGetEnvVars()) clusterCmd.AddCommand(reports.NewCmdReports()) clusterCmd.AddCommand(cad.NewCmdCad()) + clusterCmd.AddCommand(newCmdSnapshot()) + clusterCmd.AddCommand(newCmdDiff()) return clusterCmd } diff --git a/cmd/cluster/diff.go b/cmd/cluster/diff.go new file mode 100644 index 000000000..6d6aef5c6 --- /dev/null +++ b/cmd/cluster/diff.go @@ -0,0 +1,579 @@ +package cluster + +import ( + "encoding/json" + "fmt" + "os" + "strings" + + "github.com/spf13/cobra" + "gopkg.in/yaml.v3" +) + +// DiffResult represents the comparison between two snapshots +type DiffResult struct { + BeforeSnapshot string `yaml:"beforeSnapshot" json:"beforeSnapshot"` + AfterSnapshot string `yaml:"afterSnapshot" json:"afterSnapshot"` + Summary DiffSummary `yaml:"summary" json:"summary"` + NodeChanges []NodeDiff `yaml:"nodeChanges,omitempty" json:"nodeChanges,omitempty"` + OperatorChanges []OperatorDiff `yaml:"operatorChanges,omitempty" json:"operatorChanges,omitempty"` + NamespaceChanges []NamespaceDiff `yaml:"namespaceChanges,omitempty" json:"namespaceChanges,omitempty"` + ResourceChanges map[string][]ResourceDiff `yaml:"resourceChanges,omitempty" json:"resourceChanges,omitempty"` +} + +// DiffSummary provides high-level change counts +type DiffSummary struct { + TotalChanges int `yaml:"totalChanges" json:"totalChanges"` + NodesChanged int `yaml:"nodesChanged" json:"nodesChanged"` + OperatorsChanged int `yaml:"operatorsChanged" json:"operatorsChanged"` + NamespacesChanged int `yaml:"namespacesChanged" json:"namespacesChanged"` + ResourcesChanged int `yaml:"resourcesChanged" json:"resourcesChanged"` +} + +// NodeDiff represents changes to a node +type NodeDiff struct { + Name string `yaml:"name" json:"name"` + ChangeType string `yaml:"changeType" json:"changeType"` // added, removed, modified + Before string `yaml:"before,omitempty" json:"before,omitempty"` + After string `yaml:"after,omitempty" json:"after,omitempty"` +} + +// OperatorDiff represents changes to a 
ClusterOperator +type OperatorDiff struct { + Name string `yaml:"name" json:"name"` + ChangeType string `yaml:"changeType" json:"changeType"` + Field string `yaml:"field,omitempty" json:"field,omitempty"` + Before string `yaml:"before,omitempty" json:"before,omitempty"` + After string `yaml:"after,omitempty" json:"after,omitempty"` +} + +// NamespaceDiff represents changes to a namespace +type NamespaceDiff struct { + Name string `yaml:"name" json:"name"` + ChangeType string `yaml:"changeType" json:"changeType"` + Before string `yaml:"before,omitempty" json:"before,omitempty"` + After string `yaml:"after,omitempty" json:"after,omitempty"` +} + +// ResourceDiff represents changes to a resource +type ResourceDiff struct { + Name string `yaml:"name" json:"name"` + Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"` + ChangeType string `yaml:"changeType" json:"changeType"` + Before string `yaml:"before,omitempty" json:"before,omitempty"` + After string `yaml:"after,omitempty" json:"after,omitempty"` +} + +// diffOptions holds the options for the diff command +type diffOptions struct { + BeforeFile string + AfterFile string + OutputJSON bool +} + +func newCmdDiff() *cobra.Command { + opts := &diffOptions{} + + diffCmd := &cobra.Command{ + Use: "diff ", + Short: "Compare two cluster snapshots to identify changes", + Long: `Compare two cluster snapshots to identify changes. + +This command compares two snapshot files created by 'osdctl cluster snapshot' +and reports the differences. This is useful for understanding what changed +in a cluster during feature testing or validation. 
+ +Changes are categorized as: +- added: Resource exists in after but not in before +- removed: Resource exists in before but not in after +- modified: Resource exists in both but with different values`, + Example: ` # Compare two snapshots + osdctl cluster diff before.yaml after.yaml + + # Compare snapshots with JSON output + osdctl cluster diff before.yaml after.yaml --json`, + Args: cobra.ExactArgs(2), + RunE: func(cmd *cobra.Command, args []string) error { + opts.BeforeFile = args[0] + opts.AfterFile = args[1] + return opts.run() + }, + } + + diffCmd.Flags().BoolVar(&opts.OutputJSON, "json", false, "Output diff in JSON format") + + return diffCmd +} + +func (o *diffOptions) run() error { + // Load before snapshot + beforeSnapshot, err := loadSnapshot(o.BeforeFile) + if err != nil { + return fmt.Errorf("failed to load before snapshot: %w", err) + } + + // Load after snapshot + afterSnapshot, err := loadSnapshot(o.AfterFile) + if err != nil { + return fmt.Errorf("failed to load after snapshot: %w", err) + } + + // Validate snapshots are from the same cluster + if beforeSnapshot.Metadata.ClusterID != afterSnapshot.Metadata.ClusterID { + return fmt.Errorf("snapshots are from different clusters: %s (%s) vs %s (%s)", + beforeSnapshot.Metadata.ClusterName, beforeSnapshot.Metadata.ClusterID, + afterSnapshot.Metadata.ClusterName, afterSnapshot.Metadata.ClusterID) + } + + // Compare snapshots + result := compareSnapshots(beforeSnapshot, afterSnapshot, o.BeforeFile, o.AfterFile) + + // Print results + return o.printDiff(result) +} + +func loadSnapshot(filename string) (*ClusterSnapshot, error) { + data, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + + var snapshot ClusterSnapshot + if err := yaml.Unmarshal(data, &snapshot); err != nil { + return nil, err + } + + return &snapshot, nil +} + +func compareSnapshots(before, after *ClusterSnapshot, beforeFile, afterFile string) *DiffResult { + result := &DiffResult{ + BeforeSnapshot: beforeFile, + 
AfterSnapshot: afterFile, + ResourceChanges: make(map[string][]ResourceDiff), + } + + // Warn about capture errors that could cause false diffs + warnCaptureErrors(before, "before", beforeFile) + warnCaptureErrors(after, "after", afterFile) + + // Compare nodes + result.NodeChanges = compareNodes(before.Nodes, after.Nodes) + result.Summary.NodesChanged = len(result.NodeChanges) + + // Compare operators + result.OperatorChanges = compareOperators(before.Operators, after.Operators) + // Count unique operators changed (not individual field changes) + changedOperators := map[string]struct{}{} + for _, diff := range result.OperatorChanges { + changedOperators[diff.Name] = struct{}{} + } + result.Summary.OperatorsChanged = len(changedOperators) + + // Compare namespaces + result.NamespaceChanges = compareNamespaces(before.Namespaces, after.Namespaces) + result.Summary.NamespacesChanged = len(result.NamespaceChanges) + + // Compare resources + allResourceTypes := make(map[string]bool) + for k := range before.Resources { + allResourceTypes[k] = true + } + for k := range after.Resources { + allResourceTypes[k] = true + } + + for resourceType := range allResourceTypes { + beforeResources := before.Resources[resourceType] + afterResources := after.Resources[resourceType] + diffs := compareResources(beforeResources, afterResources) + if len(diffs) > 0 { + result.ResourceChanges[resourceType] = diffs + result.Summary.ResourcesChanged += len(diffs) + } + } + + result.Summary.TotalChanges = result.Summary.NodesChanged + + result.Summary.OperatorsChanged + + result.Summary.NamespacesChanged + + result.Summary.ResourcesChanged + + return result +} + +func compareNodes(before, after []NodeSnapshot) []NodeDiff { + var diffs []NodeDiff + + beforeMap := make(map[string]NodeSnapshot) + for _, n := range before { + beforeMap[n.Name] = n + } + + afterMap := make(map[string]NodeSnapshot) + for _, n := range after { + afterMap[n.Name] = n + } + + // Find added and modified nodes + for name, 
afterNode := range afterMap { + if beforeNode, exists := beforeMap[name]; !exists { + diffs = append(diffs, NodeDiff{ + Name: name, + ChangeType: "added", + After: fmt.Sprintf("Status: %s, Roles: %v, Version: %s", afterNode.Status, afterNode.Roles, afterNode.Version), + }) + } else { + // Check for any changes + var changes []string + if beforeNode.Status != afterNode.Status { + changes = append(changes, fmt.Sprintf("Status: %s -> %s", beforeNode.Status, afterNode.Status)) + } + if beforeNode.Version != afterNode.Version { + changes = append(changes, fmt.Sprintf("Version: %s -> %s", beforeNode.Version, afterNode.Version)) + } + if fmt.Sprintf("%v", beforeNode.Roles) != fmt.Sprintf("%v", afterNode.Roles) { + changes = append(changes, fmt.Sprintf("Roles: %v -> %v", beforeNode.Roles, afterNode.Roles)) + } + if len(changes) > 0 { + diffs = append(diffs, NodeDiff{ + Name: name, + ChangeType: "modified", + Before: fmt.Sprintf("Status: %s, Roles: %v, Version: %s", beforeNode.Status, beforeNode.Roles, beforeNode.Version), + After: fmt.Sprintf("Status: %s, Roles: %v, Version: %s", afterNode.Status, afterNode.Roles, afterNode.Version), + }) + } + } + } + + // Find removed nodes + for name, beforeNode := range beforeMap { + if _, exists := afterMap[name]; !exists { + diffs = append(diffs, NodeDiff{ + Name: name, + ChangeType: "removed", + Before: fmt.Sprintf("Status: %s, Roles: %v", beforeNode.Status, beforeNode.Roles), + }) + } + } + + return diffs +} + +func compareOperators(before, after []OperatorSnapshot) []OperatorDiff { + var diffs []OperatorDiff + + beforeMap := make(map[string]OperatorSnapshot) + for _, o := range before { + beforeMap[o.Name] = o + } + + afterMap := make(map[string]OperatorSnapshot) + for _, o := range after { + afterMap[o.Name] = o + } + + // Find added and modified operators + for name, afterOp := range afterMap { + if beforeOp, exists := beforeMap[name]; !exists { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "added", + After: 
formatOperatorStatus(afterOp), + }) + } else { + // Check for status changes + if beforeOp.Available != afterOp.Available { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "modified", + Field: "Available", + Before: fmt.Sprintf("%v", beforeOp.Available), + After: fmt.Sprintf("%v", afterOp.Available), + }) + } + if beforeOp.Degraded != afterOp.Degraded { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "modified", + Field: "Degraded", + Before: fmt.Sprintf("%v", beforeOp.Degraded), + After: fmt.Sprintf("%v", afterOp.Degraded), + }) + } + if beforeOp.Progressing != afterOp.Progressing { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "modified", + Field: "Progressing", + Before: fmt.Sprintf("%v", beforeOp.Progressing), + After: fmt.Sprintf("%v", afterOp.Progressing), + }) + } + if beforeOp.Version != afterOp.Version { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "modified", + Field: "Version", + Before: beforeOp.Version, + After: afterOp.Version, + }) + } + // Check for condition changes + if !slicesEqual(beforeOp.Conditions, afterOp.Conditions) { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "modified", + Field: "Conditions", + Before: strings.Join(beforeOp.Conditions, ", "), + After: strings.Join(afterOp.Conditions, ", "), + }) + } + } + } + + // Find removed operators + for name, beforeOp := range beforeMap { + if _, exists := afterMap[name]; !exists { + diffs = append(diffs, OperatorDiff{ + Name: name, + ChangeType: "removed", + Before: formatOperatorStatus(beforeOp), + }) + } + } + + return diffs +} + +func formatOperatorStatus(op OperatorSnapshot) string { + return fmt.Sprintf("Available=%v, Degraded=%v, Progressing=%v, Version=%s", + op.Available, op.Degraded, op.Progressing, op.Version) +} + +func compareNamespaces(before, after []NamespaceSnapshot) []NamespaceDiff { + var diffs []NamespaceDiff + + beforeMap := make(map[string]NamespaceSnapshot) + for _, n := range 
before { + beforeMap[n.Name] = n + } + + afterMap := make(map[string]NamespaceSnapshot) + for _, n := range after { + afterMap[n.Name] = n + } + + // Find added and modified namespaces + for name, afterNs := range afterMap { + if beforeNs, exists := beforeMap[name]; !exists { + diffs = append(diffs, NamespaceDiff{ + Name: name, + ChangeType: "added", + After: fmt.Sprintf("Status: %s", afterNs.Status), + }) + } else { + // Check for status or label changes + statusChanged := beforeNs.Status != afterNs.Status + labelsChanged := fmt.Sprintf("%v", beforeNs.Labels) != fmt.Sprintf("%v", afterNs.Labels) + if statusChanged || labelsChanged { + diffs = append(diffs, NamespaceDiff{ + Name: name, + ChangeType: "modified", + Before: fmt.Sprintf("Status: %s, Labels: %v", beforeNs.Status, beforeNs.Labels), + After: fmt.Sprintf("Status: %s, Labels: %v", afterNs.Status, afterNs.Labels), + }) + } + } + } + + // Find removed namespaces + for name, beforeNs := range beforeMap { + if _, exists := afterMap[name]; !exists { + diffs = append(diffs, NamespaceDiff{ + Name: name, + ChangeType: "removed", + Before: fmt.Sprintf("Status: %s", beforeNs.Status), + }) + } + } + + return diffs +} + +func compareResources(before, after []ResourceInfo) []ResourceDiff { + var diffs []ResourceDiff + + beforeMap := make(map[string]ResourceInfo) + for _, r := range before { + key := fmt.Sprintf("%s/%s/%s", r.Namespace, r.Kind, r.Name) + beforeMap[key] = r + } + + afterMap := make(map[string]ResourceInfo) + for _, r := range after { + key := fmt.Sprintf("%s/%s/%s", r.Namespace, r.Kind, r.Name) + afterMap[key] = r + } + + // Find added and modified resources + for key, afterRes := range afterMap { + if beforeRes, exists := beforeMap[key]; !exists { + diffs = append(diffs, ResourceDiff{ + Name: afterRes.Name, + Namespace: afterRes.Namespace, + ChangeType: "added", + After: fmt.Sprintf("Status: %s", afterRes.Status), + }) + } else if beforeRes.Status != afterRes.Status { + diffs = append(diffs, 
ResourceDiff{ + Name: afterRes.Name, + Namespace: afterRes.Namespace, + ChangeType: "modified", + Before: fmt.Sprintf("Status: %s", beforeRes.Status), + After: fmt.Sprintf("Status: %s", afterRes.Status), + }) + } + } + + // Find removed resources + for key, beforeRes := range beforeMap { + if _, exists := afterMap[key]; !exists { + diffs = append(diffs, ResourceDiff{ + Name: beforeRes.Name, + Namespace: beforeRes.Namespace, + ChangeType: "removed", + Before: fmt.Sprintf("Status: %s", beforeRes.Status), + }) + } + } + + return diffs +} + +func (o *diffOptions) printDiff(result *DiffResult) error { + if o.OutputJSON { + output, err := json.MarshalIndent(result, "", " ") + if err != nil { + return err + } + fmt.Println(string(output)) + return nil + } + + // Print human-readable diff + fmt.Printf("\n╔══════════════════════════════════════════════════════════════╗\n") + fmt.Printf("║ CLUSTER SNAPSHOT DIFF ║\n") + fmt.Printf("╠══════════════════════════════════════════════════════════════╣\n") + fmt.Printf("║ Before: %-54s ║\n", result.BeforeSnapshot) + fmt.Printf("║ After: %-54s ║\n", result.AfterSnapshot) + fmt.Printf("╚══════════════════════════════════════════════════════════════╝\n\n") + + fmt.Printf("SUMMARY\n") + fmt.Printf("───────\n") + fmt.Printf("Total Changes: %d\n", result.Summary.TotalChanges) + fmt.Printf("Nodes Changed: %d\n", result.Summary.NodesChanged) + fmt.Printf("Operators Changed: %d\n", result.Summary.OperatorsChanged) + fmt.Printf("Namespaces Changed: %d\n", result.Summary.NamespacesChanged) + fmt.Printf("Resources Changed: %d\n\n", result.Summary.ResourcesChanged) + + if result.Summary.TotalChanges == 0 { + fmt.Println("✓ No changes detected between snapshots.") + return nil + } + + // Print node changes + if len(result.NodeChanges) > 0 { + fmt.Println("NODE CHANGES") + fmt.Println("────────────") + for _, d := range result.NodeChanges { + printChange(d.Name, d.ChangeType, d.Before, d.After) + } + fmt.Println() + } + + // Print operator changes 
+ if len(result.OperatorChanges) > 0 { + fmt.Println("OPERATOR CHANGES") + fmt.Println("────────────────") + for _, d := range result.OperatorChanges { + name := d.Name + if d.Field != "" { + name = fmt.Sprintf("%s [%s]", d.Name, d.Field) + } + printChange(name, d.ChangeType, d.Before, d.After) + } + fmt.Println() + } + + // Print namespace changes + if len(result.NamespaceChanges) > 0 { + fmt.Println("NAMESPACE CHANGES") + fmt.Println("─────────────────") + for _, d := range result.NamespaceChanges { + printChange(d.Name, d.ChangeType, d.Before, d.After) + } + fmt.Println() + } + + // Print resource changes + for resourceType, changes := range result.ResourceChanges { + if len(changes) > 0 { + fmt.Printf("%s CHANGES\n", strings.ToUpper(resourceType)) + fmt.Println(strings.Repeat("─", len(resourceType)+8)) + for _, d := range changes { + name := d.Name + if d.Namespace != "" { + name = fmt.Sprintf("%s/%s", d.Namespace, d.Name) + } + printChange(name, d.ChangeType, d.Before, d.After) + } + fmt.Println() + } + } + + return nil +} + +func printChange(name, changeType, before, after string) { + var symbol string + switch changeType { + case "added": + symbol = "+" + case "removed": + symbol = "-" + case "modified": + symbol = "~" + } + + fmt.Printf(" %s %s\n", symbol, name) + if before != "" { + fmt.Printf(" Before: %s\n", before) + } + if after != "" { + fmt.Printf(" After: %s\n", after) + } +} + +// slicesEqual compares two string slices for equality +func slicesEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// warnCaptureErrors prints warnings about capture errors that could cause false diffs +func warnCaptureErrors(snapshot *ClusterSnapshot, label, filename string) { + if len(snapshot.Metadata.CaptureErrors) > 0 { + fmt.Fprintf(os.Stderr, "[WARN] %s snapshot (%s) has capture errors:\n", label, filename) + for section, errMsg := range 
snapshot.Metadata.CaptureErrors { + fmt.Fprintf(os.Stderr, " - %s: %s\n", section, errMsg) + } + fmt.Fprintln(os.Stderr, "[WARN] Diff results for failed sections may show false additions/removals") + } +} diff --git a/cmd/cluster/snapshot.go b/cmd/cluster/snapshot.go new file mode 100644 index 000000000..263d5b546 --- /dev/null +++ b/cmd/cluster/snapshot.go @@ -0,0 +1,454 @@ +package cluster + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/openshift/osdctl/pkg/utils" + "github.com/spf13/cobra" + "gopkg.in/yaml.v3" + cmdutil "k8s.io/kubectl/pkg/cmd/util" +) + +// ClusterSnapshot represents a point-in-time capture of cluster state +type ClusterSnapshot struct { + Metadata SnapshotMetadata `yaml:"metadata"` + Namespaces []NamespaceSnapshot `yaml:"namespaces,omitempty"` + Nodes []NodeSnapshot `yaml:"nodes,omitempty"` + Operators []OperatorSnapshot `yaml:"operators,omitempty"` + Resources map[string][]ResourceInfo `yaml:"resources,omitempty"` +} + +// SnapshotMetadata contains information about when/how the snapshot was taken +type SnapshotMetadata struct { + ClusterID string `yaml:"clusterId"` + ClusterName string `yaml:"clusterName"` + Timestamp time.Time `yaml:"timestamp"` + Version string `yaml:"version"` + Platform string `yaml:"platform"` + IsHCP bool `yaml:"isHCP"` + CaptureErrors map[string]string `yaml:"captureErrors,omitempty"` +} + +// NamespaceSnapshot captures namespace state +type NamespaceSnapshot struct { + Name string `yaml:"name"` + Status string `yaml:"status"` + Labels map[string]string `yaml:"labels,omitempty"` +} + +// NodeSnapshot captures node state +type NodeSnapshot struct { + Name string `yaml:"name"` + Status string `yaml:"status"` + Roles []string `yaml:"roles,omitempty"` + Version string `yaml:"version"` + Conditions []string `yaml:"conditions,omitempty"` + Labels map[string]string `yaml:"labels,omitempty"` +} + +// OperatorSnapshot captures ClusterOperator state +type 
OperatorSnapshot struct { + Name string `yaml:"name"` + Available bool `yaml:"available"` + Progressing bool `yaml:"progressing"` + Degraded bool `yaml:"degraded"` + Version string `yaml:"version,omitempty"` + Conditions []string `yaml:"conditions,omitempty"` +} + +// ResourceInfo captures basic resource information +type ResourceInfo struct { + Name string `yaml:"name"` + Namespace string `yaml:"namespace,omitempty"` + Kind string `yaml:"kind"` + Status string `yaml:"status,omitempty"` +} + +// snapshotOptions holds the options for the snapshot command +type snapshotOptions struct { + ClusterID string + OutputFile string + IncludeSecrets bool + Namespaces []string + ResourceTypes []string +} + +func newCmdSnapshot() *cobra.Command { + opts := &snapshotOptions{} + + snapshotCmd := &cobra.Command{ + Use: "snapshot", + Short: "Capture a point-in-time snapshot of cluster state", + Long: `Capture a point-in-time snapshot of cluster state for evidence collection. + +This command captures the current state of key cluster resources including: +- Namespace states +- Node conditions and readiness +- ClusterOperator status +- Custom resources (optional) + +The snapshot can be saved to a YAML file and later compared using +'osdctl cluster diff' to identify changes during feature testing.`, + Example: ` # Capture cluster snapshot to a file + osdctl cluster snapshot -C -o before.yaml + + # Capture snapshot with specific namespaces + osdctl cluster snapshot -C -o snapshot.yaml --namespaces openshift-monitoring,openshift-operators + + # Capture additional resource types + osdctl cluster snapshot -C -o snapshot.yaml --resources pods,deployments,services`, + RunE: func(cmd *cobra.Command, args []string) error { + return opts.run() + }, + } + + snapshotCmd.Flags().StringVarP(&opts.ClusterID, "cluster-id", "C", "", "Cluster ID (internal, external, or name)") + snapshotCmd.Flags().StringVarP(&opts.OutputFile, "output", "o", "", "Output file path (YAML format)") + 
snapshotCmd.Flags().StringSliceVar(&opts.Namespaces, "namespaces", []string{}, "Specific namespaces to include (default: all openshift-* namespaces)") + snapshotCmd.Flags().StringSliceVar(&opts.ResourceTypes, "resources", []string{}, "Additional resource types to capture (e.g., pods,deployments)") + cmdutil.CheckErr(snapshotCmd.MarkFlagRequired("cluster-id")) + cmdutil.CheckErr(snapshotCmd.MarkFlagRequired("output")) + + return snapshotCmd +} + +func (o *snapshotOptions) run() error { + if err := utils.IsValidClusterKey(o.ClusterID); err != nil { + return err + } + + connection, err := utils.CreateConnection() + if err != nil { + return fmt.Errorf("unable to create connection to OCM: %w", err) + } + defer connection.Close() + + cluster, err := utils.GetClusterAnyStatus(connection, o.ClusterID) + if err != nil { + return err + } + + isHCP := cluster.Hypershift().Enabled() + clusterType := "Classic" + if isHCP { + clusterType = "HCP (Hosted Control Plane)" + } + fmt.Printf("[INFO] Creating snapshot for cluster: %s (%s) - %s\n", cluster.Name(), cluster.ID(), clusterType) + + captureErrors := make(map[string]string) + + snapshot := &ClusterSnapshot{ + Metadata: SnapshotMetadata{ + ClusterID: cluster.ID(), + ClusterName: cluster.Name(), + Timestamp: time.Now().UTC(), + Version: cluster.OpenshiftVersion(), + Platform: cluster.CloudProvider().ID(), + IsHCP: isHCP, + }, + Resources: make(map[string][]ResourceInfo), + } + + if isHCP { + fmt.Println("[INFO] HCP cluster detected - note that only worker nodes will be visible") + } + + // Capture nodes + fmt.Println("[INFO] Capturing node states...") + nodes, err := o.captureNodes() + if err != nil { + fmt.Printf("[WARN] Failed to capture nodes: %v\n", err) + captureErrors["nodes"] = err.Error() + } else { + snapshot.Nodes = nodes + } + + // Capture namespaces + fmt.Println("[INFO] Capturing namespace states...") + namespaces, err := o.captureNamespaces() + if err != nil { + fmt.Printf("[WARN] Failed to capture namespaces: 
%v\n", err)
+		captureErrors["namespaces"] = err.Error()
+	} else {
+		snapshot.Namespaces = namespaces
+	}
+
+	// Capture cluster operators
+	fmt.Println("[INFO] Capturing ClusterOperator states...")
+	operators, err := o.captureClusterOperators()
+	if err != nil {
+		fmt.Printf("[WARN] Failed to capture cluster operators: %v\n", err)
+		captureErrors["operators"] = err.Error()
+	} else {
+		snapshot.Operators = operators
+	}
+
+	// Capture additional resources if specified
+	for _, resourceType := range o.ResourceTypes {
+		fmt.Printf("[INFO] Capturing %s...\n", resourceType)
+		resources, err := o.captureResources(resourceType)
+		if err != nil {
+			fmt.Printf("[WARN] Failed to capture %s: %v\n", resourceType, err)
+			captureErrors[resourceType] = err.Error()
+			continue
+		}
+		snapshot.Resources[resourceType] = resources
+	}
+
+	// Store capture errors in metadata so 'cluster diff' can warn about them
+	if len(captureErrors) > 0 {
+		snapshot.Metadata.CaptureErrors = captureErrors
+	}
+
+	// Fail if all core sections failed
+	if len(snapshot.Nodes) == 0 && len(snapshot.Namespaces) == 0 && len(snapshot.Operators) == 0 {
+		if len(captureErrors) > 0 {
+			return fmt.Errorf("failed to capture any cluster state: %v", captureErrors)
+		}
+	}
+
+	// Write snapshot to file
+	if err := o.writeSnapshot(snapshot); err != nil {
+		return fmt.Errorf("failed to write snapshot: %w", err)
+	}
+
+	fmt.Printf("[INFO] Snapshot saved to: %s\n", o.OutputFile)
+	return nil
+}
+
+// captureNodes runs `oc get nodes -o json` and condenses each node into a
+// NodeSnapshot: name, kubelet version, roles (from node-role labels), Ready
+// status, and all condition Type=Status pairs.
+//
+// NOTE: the JSON output is decoded with yaml.Unmarshal (YAML 1.2 is a
+// superset of JSON), and yaml.v3 ignores `json:"..."` struct tags — it
+// matches document keys against the lowercased field name. Explicit `yaml`
+// tags are therefore required for camelCase keys such as "nodeInfo" and
+// "kubeletVersion"; without them the kubelet version silently decodes empty.
+func (o *snapshotOptions) captureNodes() ([]NodeSnapshot, error) {
+	output, err := exec.CommandContext(context.TODO(), "oc", "get", "nodes", "-o", "json").CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("oc get nodes failed: %w: %s", err, strings.TrimSpace(string(output)))
+	}
+
+	var result struct {
+		Items []struct {
+			Metadata struct {
+				Name   string            `json:"name" yaml:"name"`
+				Labels map[string]string `json:"labels" yaml:"labels"`
+			} `json:"metadata" yaml:"metadata"`
+			Status struct {
+				Conditions []struct {
+					Type   string `json:"type" yaml:"type"`
+					Status string `json:"status" yaml:"status"`
+				} `json:"conditions" yaml:"conditions"`
+				NodeInfo struct {
+					KubeletVersion string `json:"kubeletVersion" yaml:"kubeletVersion"`
+				} `json:"nodeInfo" yaml:"nodeInfo"`
+			} `json:"status" yaml:"status"`
+		} `json:"items" yaml:"items"`
+	}
+
+	if err := yaml.Unmarshal(output, &result); err != nil {
+		return nil, err
+	}
+
+	var nodes []NodeSnapshot
+	for _, item := range result.Items {
+		node := NodeSnapshot{
+			Name:    item.Metadata.Name,
+			Version: item.Status.NodeInfo.KubeletVersion,
+			Labels:  item.Metadata.Labels,
+		}
+
+		// Extract roles from labels
+		for label := range item.Metadata.Labels {
+			if strings.HasPrefix(label, "node-role.kubernetes.io/") {
+				role := strings.TrimPrefix(label, "node-role.kubernetes.io/")
+				node.Roles = append(node.Roles, role)
+			}
+		}
+
+		// Check node conditions; Ready determines the summary Status field
+		for _, cond := range item.Status.Conditions {
+			if cond.Type == "Ready" {
+				if cond.Status == "True" {
+					node.Status = "Ready"
+				} else {
+					node.Status = "NotReady"
+				}
+			}
+			node.Conditions = append(node.Conditions, fmt.Sprintf("%s=%s", cond.Type, cond.Status))
+		}
+
+		nodes = append(nodes, node)
+	}
+
+	return nodes, nil
+}
+
+// captureNamespaces runs `oc get namespaces -o json` and returns a
+// NamespaceSnapshot per namespace. When o.Namespaces is empty the result is
+// filtered to openshift-* namespaces; otherwise only the listed namespaces
+// are kept. All decoded keys are lowercase, but explicit yaml tags are kept
+// for consistency with captureNodes (yaml.v3 ignores json tags).
+func (o *snapshotOptions) captureNamespaces() ([]NamespaceSnapshot, error) {
+	args := []string{"get", "namespaces", "-o", "json"}
+
+	output, err := exec.CommandContext(context.TODO(), "oc", args...).CombinedOutput() //#nosec G204 -- args are constructed from trusted input
+	if err != nil {
+		return nil, fmt.Errorf("oc get namespaces failed: %w: %s", err, strings.TrimSpace(string(output)))
+	}
+
+	var result struct {
+		Items []struct {
+			Metadata struct {
+				Name   string            `json:"name" yaml:"name"`
+				Labels map[string]string `json:"labels" yaml:"labels"`
+			} `json:"metadata" yaml:"metadata"`
+			Status struct {
+				Phase string `json:"phase" yaml:"phase"`
+			} `json:"status" yaml:"status"`
+		} `json:"items" yaml:"items"`
+	}
+
+	if err := yaml.Unmarshal(output, &result); err != nil {
+		return nil, err
+	}
+
+	var namespaces []NamespaceSnapshot
+	for _, item := range result.Items {
+		// Filter to openshift-* namespaces if no specific namespaces provided
+		if len(o.Namespaces) == 0 {
+			if !strings.HasPrefix(item.Metadata.Name, "openshift-") {
+				continue
+			}
+		} else {
+			found := false
+			for _, ns := range o.Namespaces {
+				if item.Metadata.Name == ns {
+					found = true
+					break
+				}
+			}
+			if !found {
+				continue
+			}
+		}
+
+		namespaces = append(namespaces, NamespaceSnapshot{
+			Name:   item.Metadata.Name,
+			Status: item.Status.Phase,
+			Labels: item.Metadata.Labels,
+		})
+	}
+
+	return namespaces, nil
+}
+
+// captureClusterOperators runs `oc get clusteroperators -o json` and condenses
+// each ClusterOperator into an OperatorSnapshot: the Available/Progressing/
+// Degraded booleans, the "operator" version entry, and all condition
+// Type=Status pairs. Explicit yaml tags are kept for consistency with
+// captureNodes (yaml.v3 ignores json tags).
+func (o *snapshotOptions) captureClusterOperators() ([]OperatorSnapshot, error) {
+	output, err := exec.CommandContext(context.TODO(), "oc", "get", "clusteroperators", "-o", "json").CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("oc get clusteroperators failed: %w: %s", err, strings.TrimSpace(string(output)))
+	}
+
+	var result struct {
+		Items []struct {
+			Metadata struct {
+				Name string `json:"name" yaml:"name"`
+			} `json:"metadata" yaml:"metadata"`
+			Status struct {
+				Conditions []struct {
+					Type   string `json:"type" yaml:"type"`
+					Status string `json:"status" yaml:"status"`
+				} `json:"conditions" yaml:"conditions"`
+				Versions []struct {
+					Name    string `json:"name" yaml:"name"`
+					Version string `json:"version" yaml:"version"`
+				} `json:"versions" yaml:"versions"`
+			} `json:"status" yaml:"status"`
+		} `json:"items" yaml:"items"`
+	}
+
+	if err := yaml.Unmarshal(output, &result); err != nil {
+		return nil, err
+	}
+
+	var operators []OperatorSnapshot
+	for _, item := range result.Items {
+		operator := OperatorSnapshot{
+			Name: item.Metadata.Name,
+		}
+
+		for _, cond := range item.Status.Conditions {
+			operator.Conditions = append(operator.Conditions, fmt.Sprintf("%s=%s", cond.Type, cond.Status))
+			switch cond.Type {
+			case "Available":
+				operator.Available = cond.Status == "True"
+			case "Progressing":
+				operator.Progressing = cond.Status == "True"
+			case "Degraded":
+				operator.Degraded = cond.Status == "True"
+			}
+		}
+
+		for _, ver := range item.Status.Versions {
+			if ver.Name == "operator" {
+				operator.Version = ver.Version
+				break
+			}
+		}
+
+		operators = append(operators, operator)
+	}
+
+	return operators, nil
+}
+
+func (o *snapshotOptions) captureResources(resourceType string) ([]ResourceInfo, error) {
+	output, err
:= exec.CommandContext(context.TODO(), "oc", "get", resourceType, "--all-namespaces", "-o", "json").CombinedOutput() //#nosec G204 -- resourceType is user-provided but filtered + if err != nil { + return nil, fmt.Errorf("oc get %s failed: %w: %s", resourceType, err, strings.TrimSpace(string(output))) + } + + var result struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + } `json:"metadata"` + Status struct { + Phase string `json:"phase"` + } `json:"status"` + } `json:"items"` + } + + if err := yaml.Unmarshal(output, &result); err != nil { + return nil, err + } + + var resources []ResourceInfo + for _, item := range result.Items { + resources = append(resources, ResourceInfo{ + Name: item.Metadata.Name, + Namespace: item.Metadata.Namespace, + Kind: resourceType, + Status: item.Status.Phase, + }) + } + + return resources, nil +} + +func (o *snapshotOptions) writeSnapshot(snapshot *ClusterSnapshot) error { + // Ensure directory exists + dir := filepath.Dir(o.OutputFile) + if dir != "" && dir != "." 
{ + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory: %w", err) + } + } + + data, err := yaml.Marshal(snapshot) + if err != nil { + return fmt.Errorf("failed to marshal snapshot: %w", err) + } + + if err := os.WriteFile(o.OutputFile, data, 0600); err != nil { + return fmt.Errorf("failed to write file: %w", err) + } + + return nil +} diff --git a/cmd/cmd.go b/cmd/cmd.go index 7a54b9448..af84635ea 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -25,6 +25,7 @@ import ( "github.com/openshift/osdctl/cmd/cost" "github.com/openshift/osdctl/cmd/dynatrace" "github.com/openshift/osdctl/cmd/env" + "github.com/openshift/osdctl/cmd/evidence" "github.com/openshift/osdctl/cmd/hcp" "github.com/openshift/osdctl/cmd/hive" "github.com/openshift/osdctl/cmd/iampermissions" @@ -93,6 +94,7 @@ func NewCmdRoot(streams genericclioptions.IOStreams) *cobra.Command { rootCmd.AddCommand(cloudtrail.NewCloudtrailCmd()) rootCmd.AddCommand(cluster.NewCmdCluster(streams, kubeClient, globalOpts)) rootCmd.AddCommand(env.NewCmdEnv()) + rootCmd.AddCommand(evidence.NewCmdEvidence()) rootCmd.AddCommand(hive.NewCmdHive(streams, kubeClient)) rootCmd.AddCommand(jira.Cmd) rootCmd.AddCommand(jumphost.NewCmdJumphost()) diff --git a/cmd/evidence/cmd.go b/cmd/evidence/cmd.go new file mode 100644 index 000000000..199038bd5 --- /dev/null +++ b/cmd/evidence/cmd.go @@ -0,0 +1,25 @@ +package evidence + +import ( + "github.com/spf13/cobra" +) + +// NewCmdEvidence returns the evidence command group +func NewCmdEvidence() *cobra.Command { + evidenceCmd := &cobra.Command{ + Use: "evidence", + Short: "Evidence collection utilities for feature testing", + Long: `Evidence collection utilities for feature testing. + +This command group provides tools to help SRE teams collect evidence +during feature validation testing. 
The collected evidence can include +CloudTrail logs, cluster snapshots, and other diagnostic information.`, + Run: func(cmd *cobra.Command, args []string) { + _ = cmd.Help() + }, + } + + evidenceCmd.AddCommand(newCmdCollect()) + + return evidenceCmd +} diff --git a/cmd/evidence/collect.go b/cmd/evidence/collect.go new file mode 100644 index 000000000..cad460f60 --- /dev/null +++ b/cmd/evidence/collect.go @@ -0,0 +1,707 @@ +package evidence + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/openshift/osdctl/pkg/osdCloud" + "github.com/openshift/osdctl/pkg/utils" + "github.com/spf13/cobra" + "gopkg.in/yaml.v3" + cmdutil "k8s.io/kubectl/pkg/cmd/util" +) + +// EvidenceCollection represents all collected evidence +type EvidenceCollection struct { + Metadata CollectionMetadata `yaml:"metadata"` + ClusterState *ClusterState `yaml:"clusterState,omitempty"` + CloudTrailData *CloudTrailData `yaml:"cloudTrailData,omitempty"` + Diagnostics *DiagnosticData `yaml:"diagnostics,omitempty"` +} + +// CollectionMetadata contains information about the evidence collection +type CollectionMetadata struct { + ClusterID string `yaml:"clusterId"` + ClusterName string `yaml:"clusterName"` + CollectionTime time.Time `yaml:"collectionTime"` + CollectorUser string `yaml:"collectorUser,omitempty"` + TimeWindowStart time.Time `yaml:"timeWindowStart"` + Platform string `yaml:"platform"` + IsHCP bool `yaml:"isHCP"` +} + +// ClusterState captures cluster resource states +type ClusterState struct { + Nodes []NodeInfo `yaml:"nodes,omitempty"` + Operators []OperatorInfo `yaml:"operators,omitempty"` + MachineConfigs []MachineConfigInfo `yaml:"machineConfigs,omitempty"` + Events []EventInfo `yaml:"events,omitempty"` +} + +// NodeInfo represents node state +type NodeInfo struct { + Name string `yaml:"name"` + Status string `yaml:"status"` + Roles []string `yaml:"roles"` + Conditions []string `yaml:"conditions,omitempty"` +} + +// 
OperatorInfo represents ClusterOperator state +type OperatorInfo struct { + Name string `yaml:"name"` + Available bool `yaml:"available"` + Progressing bool `yaml:"progressing"` + Degraded bool `yaml:"degraded"` + Version string `yaml:"version,omitempty"` +} + +// MachineConfigInfo represents MachineConfig state +type MachineConfigInfo struct { + Name string `yaml:"name"` + Created string `yaml:"created"` +} + +// EventInfo represents Kubernetes events +type EventInfo struct { + Type string `yaml:"type"` + Reason string `yaml:"reason"` + Message string `yaml:"message"` + Namespace string `yaml:"namespace"` + Object string `yaml:"object"` + Timestamp string `yaml:"timestamp"` +} + +// CloudTrailData contains CloudTrail event information +type CloudTrailData struct { + ErrorEvents []CloudTrailError `yaml:"errorEvents,omitempty"` + WriteEvents []CloudTrailEvent `yaml:"writeEvents,omitempty"` +} + +// CloudTrailError represents an AWS error event +type CloudTrailError struct { + EventTime string `yaml:"eventTime"` + EventName string `yaml:"eventName"` + ErrorCode string `yaml:"errorCode"` + ErrorMsg string `yaml:"errorMessage,omitempty"` + Username string `yaml:"username,omitempty"` + Region string `yaml:"region"` + ConsoleLink string `yaml:"consoleLink,omitempty"` +} + +// CloudTrailEvent represents an AWS API event +type CloudTrailEvent struct { + EventTime string `yaml:"eventTime"` + EventName string `yaml:"eventName"` + Username string `yaml:"username,omitempty"` + Region string `yaml:"region"` +} + +// DiagnosticData contains diagnostic commands output +type DiagnosticData struct { + MustGatherPath string `yaml:"mustGatherPath,omitempty"` + CustomCommands map[string]string `yaml:"customCommands,omitempty"` +} + +// RawEventDetails represents CloudTrail event structure +type RawEventDetails struct { + EventVersion string `json:"eventVersion"` + UserIdentity struct { + AccountId string `json:"accountId"` + SessionContext struct { + SessionIssuer struct { + Type 
string `json:"type"` + UserName string `json:"userName"` + Arn string `json:"arn"` + } `json:"sessionIssuer"` + } `json:"sessionContext"` + } `json:"userIdentity"` + EventRegion string `json:"awsRegion"` + EventId string `json:"eventID"` + ErrorCode string `json:"errorCode"` + ErrorMessage string `json:"errorMessage"` +} + +// collectOptions holds the options for the collect command +type collectOptions struct { + ClusterID string + OutputDir string + Since string + IncludeEvents bool + IncludeMustGather bool + SkipCloudTrail bool + SkipClusterState bool +} + +func newCmdCollect() *cobra.Command { + opts := &collectOptions{} + + collectCmd := &cobra.Command{ + Use: "collect", + Short: "Collect evidence from cluster and AWS for feature testing", + Long: `Collect comprehensive evidence from a cluster and AWS for feature testing. + +This all-in-one command gathers: +- Cluster state (nodes, operators, machine configs) +- CloudTrail error events (permission denied, etc.) +- Recent Kubernetes events (optional) +- must-gather output (optional) + +The collected evidence is saved to the specified output directory for +inclusion in test reports and feature validation documentation.`, + Example: ` # Collect all evidence to a directory + osdctl evidence collect -C --output ./evidence/ + + # Collect evidence from the last 2 hours + osdctl evidence collect -C --output ./evidence/ --since 2h + + # Collect evidence without CloudTrail (for non-AWS or limited access) + osdctl evidence collect -C --output ./evidence/ --skip-cloudtrail + + # Include Kubernetes events in collection + osdctl evidence collect -C --output ./evidence/ --include-events`, + RunE: func(cmd *cobra.Command, args []string) error { + return opts.run() + }, + } + + collectCmd.Flags().StringVarP(&opts.ClusterID, "cluster-id", "C", "", "Cluster ID (internal, external, or name)") + collectCmd.Flags().StringVarP(&opts.OutputDir, "output", "o", "", "Output directory for collected evidence") + 
collectCmd.Flags().StringVar(&opts.Since, "since", "1h", "Time window to look back for events (e.g., 30m, 1h, 2h)") + collectCmd.Flags().BoolVar(&opts.IncludeEvents, "include-events", false, "Include Kubernetes events in collection") + collectCmd.Flags().BoolVar(&opts.IncludeMustGather, "include-must-gather", false, "Run must-gather and include output") + collectCmd.Flags().BoolVar(&opts.SkipCloudTrail, "skip-cloudtrail", false, "Skip CloudTrail event collection") + collectCmd.Flags().BoolVar(&opts.SkipClusterState, "skip-cluster-state", false, "Skip cluster state collection") + cmdutil.CheckErr(collectCmd.MarkFlagRequired("cluster-id")) + cmdutil.CheckErr(collectCmd.MarkFlagRequired("output")) + + return collectCmd +} + +func (o *collectOptions) run() error { + if err := utils.IsValidClusterKey(o.ClusterID); err != nil { + return err + } + + connection, err := utils.CreateConnection() + if err != nil { + return fmt.Errorf("unable to create connection to OCM: %w", err) + } + defer connection.Close() + + cluster, err := utils.GetClusterAnyStatus(connection, o.ClusterID) + if err != nil { + return err + } + + // Create output directory + if err := os.MkdirAll(o.OutputDir, 0755); err != nil { + return fmt.Errorf("failed to create output directory: %w", err) + } + + startTime, err := parseDurationToUTC(o.Since) + if err != nil { + return fmt.Errorf("invalid time duration: %w", err) + } + + isHCP := cluster.Hypershift().Enabled() + clusterType := "Classic" + if isHCP { + clusterType = "HCP" + } + + fmt.Printf("╔══════════════════════════════════════════════════════════════╗\n") + fmt.Printf("║ EVIDENCE COLLECTION ║\n") + fmt.Printf("╠══════════════════════════════════════════════════════════════╣\n") + fmt.Printf("║ Cluster: %-53s ║\n", cluster.Name()) + fmt.Printf("║ ID: %-53s ║\n", cluster.ID()) + fmt.Printf("║ Type: %-53s ║\n", clusterType) + fmt.Printf("║ Since: %-53s ║\n", startTime.Format(time.RFC3339)) + 
fmt.Printf("╚══════════════════════════════════════════════════════════════╝\n\n") + + if isHCP { + fmt.Println("ℹ️ HCP cluster detected - CloudTrail events will only show customer account activity") + fmt.Println(" Control plane activity is in Red Hat's account and not visible here.") + } + + // Verify we're connected to the correct cluster + if err := o.verifyClusterContext(cluster.ID()); err != nil { + fmt.Printf("⚠️ Warning: %v\n", err) + fmt.Println(" Please ensure you're logged into the correct cluster via 'ocm backplane login'") + fmt.Println(" Continuing anyway - collected data may be from a different cluster!") + } + + evidence := &EvidenceCollection{ + Metadata: CollectionMetadata{ + ClusterID: cluster.ID(), + ClusterName: cluster.Name(), + CollectionTime: time.Now().UTC(), + TimeWindowStart: startTime, + Platform: cluster.CloudProvider().ID(), + IsHCP: isHCP, + }, + } + + // Collect cluster state + if !o.SkipClusterState { + fmt.Println("📋 Collecting cluster state...") + clusterState, err := o.collectClusterState() + if err != nil { + fmt.Printf(" ⚠️ Warning: Failed to collect cluster state: %v\n", err) + } else { + evidence.ClusterState = clusterState + fmt.Printf(" ✓ Collected state for %d nodes, %d operators\n", + len(clusterState.Nodes), len(clusterState.Operators)) + } + } + + // Collect CloudTrail data for AWS clusters + if !o.SkipCloudTrail && strings.ToUpper(cluster.CloudProvider().ID()) == "AWS" { + fmt.Println("☁️ Collecting CloudTrail data...") + // Verify AWS access is available + _, err := osdCloud.CreateAWSV2Config(connection, cluster) + if err != nil { + fmt.Printf(" ⚠️ Warning: Failed to create AWS config: %v\n", err) + } else { + cloudTrailData, err := o.collectCloudTrailData(startTime) + if err != nil { + fmt.Printf(" ⚠️ Warning: Failed to collect CloudTrail data: %v\n", err) + } else { + evidence.CloudTrailData = cloudTrailData + fmt.Println(" ✓ CloudTrail access verified. 
Use 'osdctl cloudtrail errors' for detailed error analysis.") + } + } + } + + // Include Kubernetes events + if o.IncludeEvents { + fmt.Println("📅 Collecting Kubernetes events...") + events, err := o.collectKubernetesEvents(startTime) + if err != nil { + fmt.Printf(" ⚠️ Warning: Failed to collect events: %v\n", err) + } else { + if evidence.ClusterState == nil { + evidence.ClusterState = &ClusterState{} + } + evidence.ClusterState.Events = events + fmt.Printf(" ✓ Collected %d events\n", len(events)) + } + } + + // Run must-gather if requested + if o.IncludeMustGather { + fmt.Println("📦 Running must-gather...") + mustGatherPath, err := o.runMustGather() + if err != nil { + fmt.Printf(" ⚠️ Warning: must-gather failed: %v\n", err) + } else { + if evidence.Diagnostics == nil { + evidence.Diagnostics = &DiagnosticData{} + } + evidence.Diagnostics.MustGatherPath = mustGatherPath + fmt.Printf(" ✓ must-gather saved to: %s\n", mustGatherPath) + } + } + + // Write evidence to files + fmt.Println("\n💾 Saving evidence...") + + // Save main evidence file + evidenceFile := filepath.Join(o.OutputDir, "evidence.yaml") + if err := o.saveEvidence(evidence, evidenceFile); err != nil { + return fmt.Errorf("failed to save evidence: %w", err) + } + fmt.Printf(" ✓ Evidence saved to: %s\n", evidenceFile) + + // Save summary + summaryFile := filepath.Join(o.OutputDir, "summary.txt") + if err := o.saveSummary(evidence, summaryFile); err != nil { + fmt.Printf(" ⚠️ Warning: Failed to save summary: %v\n", err) + } else { + fmt.Printf(" ✓ Summary saved to: %s\n", summaryFile) + } + + fmt.Printf("\n✅ Evidence collection complete!\n") + fmt.Printf(" Output directory: %s\n", o.OutputDir) + + return nil +} + +func (o *collectOptions) collectClusterState() (*ClusterState, error) { + state := &ClusterState{} + var warnings []string + + // Collect nodes + nodes, err := o.collectNodes() + if err != nil { + warnings = append(warnings, fmt.Sprintf("nodes: %v", err)) + } else { + state.Nodes = nodes + } + 
+ // Collect operators + operators, err := o.collectOperators() + if err != nil { + warnings = append(warnings, fmt.Sprintf("operators: %v", err)) + } else { + state.Operators = operators + } + + // Collect machine configs + machineConfigs, err := o.collectMachineConfigs() + if err != nil { + warnings = append(warnings, fmt.Sprintf("machineconfigs: %v", err)) + } else { + state.MachineConfigs = machineConfigs + } + + // Return error if all collections failed + if len(state.Nodes) == 0 && len(state.Operators) == 0 && len(state.MachineConfigs) == 0 && len(warnings) > 0 { + return nil, fmt.Errorf("all collections failed: %v", warnings) + } + + // Print warnings for partial failures + for _, w := range warnings { + fmt.Printf(" ⚠️ Warning: Failed to collect %s\n", w) + } + + return state, nil +} + +func (o *collectOptions) collectNodes() ([]NodeInfo, error) { + output, err := exec.CommandContext(context.TODO(), "oc", "get", "nodes", "-o", "json").Output() + if err != nil { + return nil, err + } + + var result struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + Labels map[string]string `json:"labels"` + } `json:"metadata"` + Status struct { + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + } `json:"conditions"` + } `json:"status"` + } `json:"items"` + } + + if err := json.Unmarshal(output, &result); err != nil { + return nil, err + } + + var nodes []NodeInfo + for _, item := range result.Items { + node := NodeInfo{ + Name: item.Metadata.Name, + } + + // Extract roles + for label := range item.Metadata.Labels { + if strings.HasPrefix(label, "node-role.kubernetes.io/") { + role := strings.TrimPrefix(label, "node-role.kubernetes.io/") + node.Roles = append(node.Roles, role) + } + } + + // Check conditions + for _, cond := range item.Status.Conditions { + if cond.Type == "Ready" { + if cond.Status == "True" { + node.Status = "Ready" + } else { + node.Status = "NotReady" + } + } + node.Conditions = 
append(node.Conditions, fmt.Sprintf("%s=%s", cond.Type, cond.Status)) + } + + nodes = append(nodes, node) + } + + return nodes, nil +} + +func (o *collectOptions) collectOperators() ([]OperatorInfo, error) { + output, err := exec.CommandContext(context.TODO(), "oc", "get", "clusteroperators", "-o", "json").Output() + if err != nil { + return nil, err + } + + var result struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + } `json:"metadata"` + Status struct { + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + } `json:"conditions"` + Versions []struct { + Name string `json:"name"` + Version string `json:"version"` + } `json:"versions"` + } `json:"status"` + } `json:"items"` + } + + if err := json.Unmarshal(output, &result); err != nil { + return nil, err + } + + var operators []OperatorInfo + for _, item := range result.Items { + operator := OperatorInfo{ + Name: item.Metadata.Name, + } + + for _, cond := range item.Status.Conditions { + switch cond.Type { + case "Available": + operator.Available = cond.Status == "True" + case "Progressing": + operator.Progressing = cond.Status == "True" + case "Degraded": + operator.Degraded = cond.Status == "True" + } + } + + for _, ver := range item.Status.Versions { + if ver.Name == "operator" { + operator.Version = ver.Version + break + } + } + + operators = append(operators, operator) + } + + return operators, nil +} + +func (o *collectOptions) collectMachineConfigs() ([]MachineConfigInfo, error) { + output, err := exec.CommandContext(context.TODO(), "oc", "get", "machineconfigs", "-o", "json").Output() + if err != nil { + return nil, err + } + + var result struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + CreationTimestamp string `json:"creationTimestamp"` + } `json:"metadata"` + } `json:"items"` + } + + if err := json.Unmarshal(output, &result); err != nil { + return nil, err + } + + var configs []MachineConfigInfo + for _, item := range 
result.Items {
+		configs = append(configs, MachineConfigInfo{
+			Name:    item.Metadata.Name,
+			Created: item.Metadata.CreationTimestamp,
+		})
+	}
+
+	return configs, nil
+}
+
+// collectKubernetesEvents runs `oc get events --all-namespaces -o json` and
+// returns one EventInfo per event whose lastTimestamp is at or after
+// startTime. Events with an empty or unparseable lastTimestamp are kept
+// (best effort) rather than dropped.
+func (o *collectOptions) collectKubernetesEvents(startTime time.Time) ([]EventInfo, error) {
+	output, err := exec.CommandContext(context.TODO(), "oc", "get", "events", "--all-namespaces", "-o", "json").Output()
+	if err != nil {
+		return nil, err
+	}
+
+	// Anonymous struct mirroring only the event fields we summarize.
+	var result struct {
+		Items []struct {
+			Type     string `json:"type"`
+			Reason   string `json:"reason"`
+			Message  string `json:"message"`
+			Metadata struct {
+				Namespace string `json:"namespace"`
+			} `json:"metadata"`
+			InvolvedObject struct {
+				Kind string `json:"kind"`
+				Name string `json:"name"`
+			} `json:"involvedObject"`
+			LastTimestamp string `json:"lastTimestamp"`
+		} `json:"items"`
+	}
+
+	if err := json.Unmarshal(output, &result); err != nil {
+		return nil, err
+	}
+
+	var events []EventInfo
+	for _, item := range result.Items {
+		// Filter events by startTime
+		if item.LastTimestamp != "" {
+			eventTime, err := time.Parse(time.RFC3339, item.LastTimestamp)
+			if err == nil && eventTime.Before(startTime) {
+				continue // Skip events older than startTime
+			}
+		}
+		events = append(events, EventInfo{
+			Type:      item.Type,
+			Reason:    item.Reason,
+			Message:   item.Message,
+			Namespace: item.Metadata.Namespace,
+			// Object is rendered as "Kind/Name", e.g. "Pod/my-pod".
+			Object:    fmt.Sprintf("%s/%s", item.InvolvedObject.Kind, item.InvolvedObject.Name),
+			Timestamp: item.LastTimestamp,
+		})
+	}
+
+	return events, nil
+}
+
+// collectCloudTrailData is currently a stub: it always returns (nil, nil), so
+// the caller records no CloudTrail data in the evidence file and points users
+// at the dedicated command instead.
+func (o *collectOptions) collectCloudTrailData(startTime time.Time) (*CloudTrailData, error) {
+	// Note: Full CloudTrail error data collection is handled by 'osdctl cloudtrail errors'
+	// This function verifies AWS access is available
+	// For detailed CloudTrail analysis, use: osdctl cloudtrail errors -C  --since 
+	_ = startTime
+
+	// Return nil to indicate CloudTrail was not collected (use dedicated command for details)
+	return nil, nil
+}
+
+// runMustGather runs `oc adm must-gather` into <OutputDir>/must-gather,
+// streaming the tool's output to this process's stdout/stderr, and returns
+// the destination directory on success.
+func (o *collectOptions) runMustGather() (string,
error) {
+	mustGatherDir := filepath.Join(o.OutputDir, "must-gather")
+	if err := os.MkdirAll(mustGatherDir, 0755); err != nil {
+		return "", err
+	}
+
+	cmd := exec.CommandContext(context.TODO(), "oc", "adm", "must-gather", "--dest-dir", mustGatherDir) //#nosec G204 -- command args are trusted
+	// Stream must-gather progress directly to the user's terminal.
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", err
+	}
+
+	return mustGatherDir, nil
+}
+
+// saveEvidence marshals the full evidence collection to YAML and writes it to
+// filename with 0600 permissions (evidence may contain sensitive details).
+func (o *collectOptions) saveEvidence(evidence *EvidenceCollection, filename string) error {
+	data, err := yaml.Marshal(evidence)
+	if err != nil {
+		return err
+	}
+
+	return os.WriteFile(filename, data, 0600)
+}
+
+// saveSummary writes a human-readable summary of the collected evidence to
+// filename: collection metadata, cluster-state counts with degraded-operator
+// and not-ready-node call-outs, and CloudTrail event counts with up to ten
+// recent errors.
+func (o *collectOptions) saveSummary(evidence *EvidenceCollection, filename string) error {
+	var sb strings.Builder
+
+	sb.WriteString("EVIDENCE COLLECTION SUMMARY\n")
+	sb.WriteString("===========================\n\n")
+	sb.WriteString(fmt.Sprintf("Cluster: %s (%s)\n", evidence.Metadata.ClusterName, evidence.Metadata.ClusterID))
+	sb.WriteString(fmt.Sprintf("Platform: %s\n", evidence.Metadata.Platform))
+	sb.WriteString(fmt.Sprintf("Collection Time: %s\n", evidence.Metadata.CollectionTime.Format(time.RFC3339)))
+	sb.WriteString(fmt.Sprintf("Time Window Start: %s\n\n", evidence.Metadata.TimeWindowStart.Format(time.RFC3339)))
+
+	if evidence.ClusterState != nil {
+		sb.WriteString("CLUSTER STATE\n")
+		sb.WriteString("-------------\n")
+		sb.WriteString(fmt.Sprintf("Nodes: %d\n", len(evidence.ClusterState.Nodes)))
+		sb.WriteString(fmt.Sprintf("Operators: %d\n", len(evidence.ClusterState.Operators)))
+		sb.WriteString(fmt.Sprintf("MachineConfigs: %d\n", len(evidence.ClusterState.MachineConfigs)))
+		sb.WriteString(fmt.Sprintf("Events: %d\n\n", len(evidence.ClusterState.Events)))
+
+		// Count degraded operators
+		degradedCount := 0
+		for _, op := range evidence.ClusterState.Operators {
+			if op.Degraded {
+				degradedCount++
+			}
+		}
+		if degradedCount > 0 {
+			sb.WriteString(fmt.Sprintf("⚠️ Degraded Operators: %d\n",
degradedCount)) + for _, op := range evidence.ClusterState.Operators { + if op.Degraded { + sb.WriteString(fmt.Sprintf(" - %s\n", op.Name)) + } + } + sb.WriteString("\n") + } + + // Count not ready nodes + notReadyCount := 0 + for _, node := range evidence.ClusterState.Nodes { + if node.Status != "Ready" { + notReadyCount++ + } + } + if notReadyCount > 0 { + sb.WriteString(fmt.Sprintf("⚠️ Not Ready Nodes: %d\n", notReadyCount)) + for _, node := range evidence.ClusterState.Nodes { + if node.Status != "Ready" { + sb.WriteString(fmt.Sprintf(" - %s\n", node.Name)) + } + } + sb.WriteString("\n") + } + } + + if evidence.CloudTrailData != nil { + sb.WriteString("CLOUDTRAIL DATA\n") + sb.WriteString("---------------\n") + sb.WriteString(fmt.Sprintf("Error Events: %d\n", len(evidence.CloudTrailData.ErrorEvents))) + sb.WriteString(fmt.Sprintf("Write Events: %d\n\n", len(evidence.CloudTrailData.WriteEvents))) + + if len(evidence.CloudTrailData.ErrorEvents) > 0 { + sb.WriteString("Recent Errors:\n") + maxErrors := 10 + if len(evidence.CloudTrailData.ErrorEvents) < maxErrors { + maxErrors = len(evidence.CloudTrailData.ErrorEvents) + } + for i := 0; i < maxErrors; i++ { + e := evidence.CloudTrailData.ErrorEvents[i] + sb.WriteString(fmt.Sprintf(" - %s: %s (%s)\n", e.EventTime, e.EventName, e.ErrorCode)) + } + sb.WriteString("\n") + } + } + + return os.WriteFile(filename, []byte(sb.String()), 0600) +} + +func parseDurationToUTC(input string) (time.Time, error) { + duration, err := time.ParseDuration(input) + if err != nil { + return time.Time{}, fmt.Errorf("unable to parse time duration: %w", err) + } + if duration <= 0 { + return time.Time{}, fmt.Errorf("duration must be positive (e.g., 1h, 30m)") + } + return time.Now().UTC().Add(-duration), nil +} + +// verifyClusterContext checks if the current oc context appears to match the target cluster +func (o *collectOptions) verifyClusterContext(clusterID string) error { + // Get current context info by checking cluster-info + output, 
err := exec.CommandContext(context.TODO(), "oc", "whoami", "--show-server").Output() + if err != nil { + return fmt.Errorf("unable to verify cluster context: %w", err) + } + + serverURL := strings.TrimSpace(string(output)) + // Check if server URL contains the cluster ID (common pattern for backplane URLs) + if !strings.Contains(serverURL, clusterID) { + return fmt.Errorf("current context server (%s) may not match target cluster (%s)", serverURL, clusterID) + } + + return nil +} diff --git a/docs/README.md b/docs/README.md index 458cac5e2..edab75345 100644 --- a/docs/README.md +++ b/docs/README.md @@ -37,6 +37,7 @@ - `list --cluster-id ` - List all silences - `org [--all --duration --comment | --alertname --duration --comment]` - Add new silence for alert for org - `cloudtrail` - AWS CloudTrail related utilities + - `errors` - Prints CloudTrail error events (permission/IAM issues) to console. - `permission-denied-events` - Prints cloudtrail permission-denied events to console. - `write-events` - Prints cloudtrail write events to console with advanced filtering options - `cluster` - Provides information for a specified cluster @@ -48,6 +49,7 @@ - `context --cluster-id ` - Shows the context of a specified cluster - `cpd` - Runs diagnostic for a Cluster Provisioning Delay (CPD) - `detach-stuck-volume --cluster-id ` - Detach openshift-monitoring namespace's volume from a cluster forcefully + - `diff ` - Compare two cluster snapshots to identify changes - `etcd-health-check --cluster-id --reason ` - Checks the etcd components and member health - `etcd-member-replace --cluster-id ` - Replaces an unhealthy etcd node - `from-infra-id` - Get cluster ID and external ID from a given infrastructure ID commonly used by Splunk @@ -66,6 +68,7 @@ - `infra` - Resize an OSD/ROSA cluster's infra nodes - `request-serving-nodes` - Resize a ROSA HCP cluster's request-serving nodes - `resync` - Force a resync of a cluster from Hive + - `snapshot` - Capture a point-in-time snapshot of 
cluster state - `sre-operators` - SRE operator related utilities - `describe` - Describe SRE operators - `list` - List the current and latest version of SRE operators @@ -91,6 +94,8 @@ - `logs --cluster-id ` - Fetch logs from Dynatrace - `url --cluster-id ` - Get the Dynatrace Tenant URL for a given MC or HCP cluster - `env [flags] [env-alias]` - Create an environment to interact with a cluster +- `evidence` - Evidence collection utilities for feature testing + - `collect` - Collect evidence from cluster and AWS for feature testing - `hcp` - - `force-upgrade` - Schedule forced control plane upgrade for HCP clusters (Requires ForceUpgrader permissions) - `get-cp-autoscaling-status` - Get control plane autoscaling status for hosted clusters on a management cluster @@ -1113,6 +1118,47 @@ osdctl cloudtrail [flags] -S, --skip-version-check skip checking to see if this is the most recent release ``` +### osdctl cloudtrail errors + +Surfaces permission and IAM-related errors from AWS CloudTrail. + +By default, matches these error patterns: + - AccessDenied + - UnauthorizedOperation / Client.UnauthorizedOperation + - Forbidden + - InvalidClientTokenId + - AuthFailure + - ExpiredToken + - SignatureDoesNotMatch + +Use --error-types to filter for specific error patterns. + +``` +osdctl cloudtrail errors [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string Cluster ID + --context string The name of the kubeconfig context to use + --error-types strings Comma-separated list of error patterns to match (default: all common permission errors) + -h, --help help for errors + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. 
This will make your HTTPS connections insecure + --json Output results as JSON + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + -r, --raw-event Print raw CloudTrail event JSON + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --since string Time window to search (e.g., 30m, 1h, 24h). Valid units: ns, us, ms, s, m, h. (default "1h") + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release + -u, --url Include console URL links for each event +``` + ### osdctl cloudtrail permission-denied-events Prints cloudtrail permission-denied events to console. @@ -1475,6 +1521,40 @@ osdctl cluster detach-stuck-volume --cluster-id [flags] -S, --skip-version-check skip checking to see if this is the most recent release ``` +### osdctl cluster diff + +Compare two cluster snapshots to identify changes. + +This command compares two snapshot files created by 'osdctl cluster snapshot' +and reports the differences. This is useful for understanding what changed +in a cluster during feature testing or validation. + +Changes are categorized as: +- added: Resource exists in after but not in before +- removed: Resource exists in before but not in after +- modified: Resource exists in both but with different values + +``` +osdctl cluster diff [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. 
+ --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + -h, --help help for diff + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --json Output diff in JSON format + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + ### osdctl cluster etcd-health-check Checks etcd component health status for member replacement @@ -2018,6 +2098,42 @@ osdctl cluster resync [flags] -S, --skip-version-check skip checking to see if this is the most recent release ``` +### osdctl cluster snapshot + +Capture a point-in-time snapshot of cluster state for evidence collection. + +This command captures the current state of key cluster resources including: +- Namespace states +- Node conditions and readiness +- ClusterOperator status +- Custom resources (optional) + +The snapshot can be saved to a YAML file and later compared using +'osdctl cluster diff' to identify changes during feature testing. + +``` +osdctl cluster snapshot [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. 
+ --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string Cluster ID (internal, external, or name) + --context string The name of the kubeconfig context to use + -h, --help help for snapshot + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + --namespaces strings Specific namespaces to include (default: all openshift-* namespaces) + -o, --output string Output file path (YAML format) + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + --resources strings Additional resource types to capture (e.g., pods,deployments) + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + ### osdctl cluster sre-operators SRE operator related utilities @@ -2812,6 +2928,73 @@ osdctl env [flags] [env-alias] -u, --username string Username for individual cluster login ``` +### osdctl evidence + +Evidence collection utilities for feature testing. + +This command group provides tools to help SRE teams collect evidence +during feature validation testing. The collected evidence can include +CloudTrail logs, cluster snapshots, and other diagnostic information. + +``` +osdctl evidence [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. 
+ --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + -h, --help help for evidence + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### osdctl evidence collect + +Collect comprehensive evidence from a cluster and AWS for feature testing. + +This all-in-one command gathers: +- Cluster state (nodes, operators, machine configs) +- CloudTrail error events (permission denied, etc.) +- Recent Kubernetes events (optional) +- must-gather output (optional) + +The collected evidence is saved to the specified output directory for +inclusion in test reports and feature validation documentation. + +``` +osdctl evidence collect [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string Cluster ID (internal, external, or name) + --context string The name of the kubeconfig context to use + -h, --help help for collect + --include-events Include Kubernetes events in collection + --include-must-gather Run must-gather and include output + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. 
This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Output directory for collected evidence + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --since string Time window to look back for events (e.g., 30m, 1h, 2h) (default "1h") + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + --skip-cloudtrail Skip CloudTrail event collection + --skip-cluster-state Skip cluster state collection + -S, --skip-version-check skip checking to see if this is the most recent release +``` + ### osdctl hcp ``` diff --git a/docs/osdctl.md b/docs/osdctl.md index 209867412..612e22a25 100644 --- a/docs/osdctl.md +++ b/docs/osdctl.md @@ -32,6 +32,7 @@ CLI tool to provide OSD related utilities * [osdctl cost](osdctl_cost.md) - Cost Management related utilities * [osdctl dynatrace](osdctl_dynatrace.md) - Dynatrace related utilities * [osdctl env](osdctl_env.md) - Create an environment to interact with a cluster +* [osdctl evidence](osdctl_evidence.md) - Evidence collection utilities for feature testing * [osdctl hcp](osdctl_hcp.md) - * [osdctl hive](osdctl_hive.md) - hive related utilities * [osdctl iampermissions](osdctl_iampermissions.md) - STS/WIF utilities diff --git a/docs/osdctl_cloudtrail.md b/docs/osdctl_cloudtrail.md index 754263018..9628ecd81 100644 --- a/docs/osdctl_cloudtrail.md +++ b/docs/osdctl_cloudtrail.md @@ -30,6 +30,7 @@ osdctl cloudtrail [flags] ### SEE ALSO * [osdctl](osdctl.md) - OSD CLI +* [osdctl cloudtrail errors](osdctl_cloudtrail_errors.md) - Prints CloudTrail error events (permission/IAM issues) to console. 
* [osdctl cloudtrail permission-denied-events](osdctl_cloudtrail_permission-denied-events.md) - Prints cloudtrail permission-denied events to console. * [osdctl cloudtrail write-events](osdctl_cloudtrail_write-events.md) - Prints cloudtrail write events to console with advanced filtering options diff --git a/docs/osdctl_cloudtrail_errors.md b/docs/osdctl_cloudtrail_errors.md new file mode 100644 index 000000000..8522d9998 --- /dev/null +++ b/docs/osdctl_cloudtrail_errors.md @@ -0,0 +1,70 @@ +## osdctl cloudtrail errors + +Prints CloudTrail error events (permission/IAM issues) to console. + +### Synopsis + +Surfaces permission and IAM-related errors from AWS CloudTrail. + +By default, matches these error patterns: + - AccessDenied + - UnauthorizedOperation / Client.UnauthorizedOperation + - Forbidden + - InvalidClientTokenId + - AuthFailure + - ExpiredToken + - SignatureDoesNotMatch + +Use --error-types to filter for specific error patterns. + +``` +osdctl cloudtrail errors [flags] +``` + +### Examples + +``` + # Check for permission errors in the last hour + osdctl cloudtrail errors -C <cluster-id> --since 1h + + # Check for specific error types only + osdctl cloudtrail errors -C <cluster-id> --error-types AccessDenied,Forbidden + + # Output as JSON for scripting + osdctl cloudtrail errors -C <cluster-id> --json + + # Include console links for each event + osdctl cloudtrail errors -C <cluster-id> --url +``` + +### Options + +``` + -C, --cluster-id string Cluster ID + --error-types strings Comma-separated list of error patterns to match (default: all common permission errors) + -h, --help help for errors + --json Output results as JSON + -r, --raw-event Print raw CloudTrail event JSON + --since string Time window to search (e.g., 30m, 1h, 24h). Valid units: ns, us, ms, s, m, h. (default "1h") + -u, --url Include console URL links for each event +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. 
User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cloudtrail](osdctl_cloudtrail.md) - AWS CloudTrail related utilities + diff --git a/docs/osdctl_cluster.md b/docs/osdctl_cluster.md index d4e8e315f..2c30a635a 100644 --- a/docs/osdctl_cluster.md +++ b/docs/osdctl_cluster.md @@ -32,6 +32,7 @@ Provides information for a specified cluster * [osdctl cluster context](osdctl_cluster_context.md) - Shows the context of a specified cluster * [osdctl cluster cpd](osdctl_cluster_cpd.md) - Runs diagnostic for a Cluster Provisioning Delay (CPD) * [osdctl cluster detach-stuck-volume](osdctl_cluster_detach-stuck-volume.md) - Detach openshift-monitoring namespace's volume from a cluster forcefully +* [osdctl cluster diff](osdctl_cluster_diff.md) - Compare two cluster snapshots to identify changes * [osdctl cluster etcd-health-check](osdctl_cluster_etcd-health-check.md) - Checks the etcd components and member health * [osdctl cluster etcd-member-replace](osdctl_cluster_etcd-member-replace.md) - Replaces an unhealthy etcd node * [osdctl cluster 
from-infra-id](osdctl_cluster_from-infra-id.md) - Get cluster ID and external ID from a given infrastructure ID commonly used by Splunk @@ -44,6 +45,7 @@ Provides information for a specified cluster * [osdctl cluster reports](osdctl_cluster_reports.md) - Manage cluster reports in backplane-api * [osdctl cluster resize](osdctl_cluster_resize.md) - resize control-plane/infra nodes * [osdctl cluster resync](osdctl_cluster_resync.md) - Force a resync of a cluster from Hive +* [osdctl cluster snapshot](osdctl_cluster_snapshot.md) - Capture a point-in-time snapshot of cluster state * [osdctl cluster sre-operators](osdctl_cluster_sre-operators.md) - SRE operator related utilities * [osdctl cluster ssh](osdctl_cluster_ssh.md) - utilities for accessing cluster via ssh * [osdctl cluster support](osdctl_cluster_support.md) - Cluster Support diff --git a/docs/osdctl_cluster_diff.md b/docs/osdctl_cluster_diff.md new file mode 100644 index 000000000..a98aa8ffe --- /dev/null +++ b/docs/osdctl_cluster_diff.md @@ -0,0 +1,57 @@ +## osdctl cluster diff + +Compare two cluster snapshots to identify changes + +### Synopsis + +Compare two cluster snapshots to identify changes. + +This command compares two snapshot files created by 'osdctl cluster snapshot' +and reports the differences. This is useful for understanding what changed +in a cluster during feature testing or validation. 
 + +Changes are categorized as: +- added: Resource exists in after but not in before +- removed: Resource exists in before but not in after +- modified: Resource exists in both but with different values + +``` +osdctl cluster diff <before-snapshot> <after-snapshot> [flags] +``` + +### Examples + +``` + # Compare two snapshots + osdctl cluster diff before.yaml after.yaml + + # Compare snapshots with JSON output + osdctl cluster diff before.yaml after.yaml --json +``` + +### Options + +``` + -h, --help help for diff + --json Output diff in JSON format +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster](osdctl_cluster.md) - Provides information for a specified cluster + diff --git a/docs/osdctl_cluster_snapshot.md b/docs/osdctl_cluster_snapshot.md new file mode 100644 index 000000000..27598f6cb --- /dev/null +++ b/docs/osdctl_cluster_snapshot.md @@ -0,0 +1,62 @@ +## osdctl cluster snapshot + +Capture a point-in-time snapshot of cluster state + +### Synopsis + +Capture a point-in-time snapshot of cluster state for evidence collection. + +This command captures the current state of key cluster resources including: +- Namespace states +- Node conditions and readiness +- ClusterOperator status +- Custom resources (optional) + +The snapshot can be saved to a YAML file and later compared using +'osdctl cluster diff' to identify changes during feature testing. + +``` +osdctl cluster snapshot [flags] +``` + +### Examples + +``` + # Capture cluster snapshot to a file + osdctl cluster snapshot -C -o before.yaml + + # Capture snapshot with specific namespaces + osdctl cluster snapshot -C -o snapshot.yaml --namespaces openshift-monitoring,openshift-operators + + # Capture additional resource types + osdctl cluster snapshot -C -o snapshot.yaml --resources pods,deployments,services +``` + +### Options + +``` + -C, --cluster-id string Cluster ID (internal, external, or name) + -h, --help help for snapshot + --namespaces strings Specific namespaces to include (default: all openshift-* namespaces) + -o, --output string Output file path (YAML format) + --resources strings Additional resource types to capture (e.g., pods,deployments) +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. 
User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster](osdctl_cluster.md) - Provides information for a specified cluster + diff --git a/docs/osdctl_evidence.md b/docs/osdctl_evidence.md new file mode 100644 index 000000000..8c95ffbd7 --- /dev/null +++ b/docs/osdctl_evidence.md @@ -0,0 +1,42 @@ +## osdctl evidence + +Evidence collection utilities for feature testing + +### Synopsis + +Evidence collection utilities for feature testing. + +This command group provides tools to help SRE teams collect evidence +during feature validation testing. The collected evidence can include +CloudTrail logs, cluster snapshots, and other diagnostic information. + +``` +osdctl evidence [flags] +``` + +### Options + +``` + -h, --help help for evidence +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. 
+ --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl](osdctl.md) - OSD CLI +* [osdctl evidence collect](osdctl_evidence_collect.md) - Collect evidence from cluster and AWS for feature testing + diff --git a/docs/osdctl_evidence_collect.md b/docs/osdctl_evidence_collect.md new file mode 100644 index 000000000..27f9ee0fa --- /dev/null +++ b/docs/osdctl_evidence_collect.md @@ -0,0 +1,68 @@ +## osdctl evidence collect + +Collect evidence from cluster and AWS for feature testing + +### Synopsis + +Collect comprehensive evidence from a cluster and AWS for feature testing. + +This all-in-one command gathers: +- Cluster state (nodes, operators, machine configs) +- CloudTrail error events (permission denied, etc.) +- Recent Kubernetes events (optional) +- must-gather output (optional) + +The collected evidence is saved to the specified output directory for +inclusion in test reports and feature validation documentation. 
 + +``` +osdctl evidence collect [flags] +``` + +### Examples + +``` + # Collect all evidence to a directory + osdctl evidence collect -C <cluster-id> --output ./evidence/ + + # Collect evidence from the last 2 hours + osdctl evidence collect -C <cluster-id> --output ./evidence/ --since 2h + + # Collect evidence without CloudTrail (for non-AWS or limited access) + osdctl evidence collect -C <cluster-id> --output ./evidence/ --skip-cloudtrail + + # Include Kubernetes events in collection + osdctl evidence collect -C <cluster-id> --output ./evidence/ --include-events +``` + +### Options + +``` + -C, --cluster-id string Cluster ID (internal, external, or name) + -h, --help help for collect + --include-events Include Kubernetes events in collection + --include-must-gather Run must-gather and include output + -o, --output string Output directory for collected evidence + --since string Time window to look back for events (e.g., 30m, 1h, 2h) (default "1h") + --skip-cloudtrail Skip CloudTrail event collection + --skip-cluster-state Skip cluster state collection +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl evidence](osdctl_evidence.md) - Evidence collection utilities for feature testing +