Povești adevărate din câmpul de luptă (de date) – Partea 1: Comunicarea despre date

URMĂREȘTE-NE
16,065 Fani – Îmi place
1,142 Cititori – Conectați-vă
  

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(ggpubr)
library(corrplot)


# Fix the RNG state so the shuffled means and simulated values are repeatable
set.seed(43)

################################
# The graph was too complicated
################################


# Shuffle the eight candidate group means; after shuffling, position i holds
# the mean that is used for group i
group_means <- sample(c(10, 15, 20, 20.01, 30, 35, 40, 40.2))

# Show which mean ended up in which position
group_means

# Simulate 30 observations per group (8 groups, 240 rows in total), each drawn
# from a normal distribution with that group's mean and a standard deviation
# of 5. vapply() draws the groups one after another and returns a 30 x 8
# matrix with one column per group; as.vector() flattens it column by column,
# so the values stay aligned with the Group labels.
data_too_complicated <- data.frame(
  Group = factor(rep(seq_len(8), each = 30)),
  Value = as.vector(
    vapply(group_means, function(mu) rnorm(30, mean = mu, sd = 5), numeric(30))
  )
)


# Deliberately busy plot: per-group boxplots with the raw points overlaid and
# significance brackets for every pairwise comparison
ggplot(data_too_complicated, aes(x = Group, y = Value, fill = Group)) +
  # One box per group showing the distribution of Value
  geom_boxplot() +
  # Overlay the individual observations; width = 0.2 limits the horizontal
  # jitter and alpha = 0.5 makes the points semi-transparent
  geom_jitter(width = 0.2, alpha = 0.5) +
  # Run a t-test for every pair of Group levels (8 levels -> 28 pairs);
  # label = "p.signif" requests significance stars and hide.ns = TRUE asks for
  # non-significant results to be hidden
  stat_compare_means(
    comparisons = combn(levels(data_too_complicated$Group), 2, simplify = FALSE),
    method = "t.test", label = "p.signif", hide.ns = TRUE
  ) +
  # Apply a minimal theme for a clean look
  theme_minimal() +
  # Plot title, axis labels and the legend title for the fill scale
  labs(title = "Estimated Profit Per Client Group",
       x = "Client Group",
       y = "Value",
       fill = "Client Group") +
  # Format the y-axis: scientific notation for the labels, and ask for roughly
  # 10 "pretty" breaks (pretty_breaks() treats n as a target, not a guarantee)
  scale_y_continuous(labels = scales::scientific,
                     breaks = scales::pretty_breaks(n = 10)) +
  # Larger y-axis text and a black border around the whole plot.
  # element_rect() takes `linewidth` for the border thickness; its `size`
  # argument is deprecated as of ggplot2 3.4.0 and triggers a warning.
  theme(axis.text.y = element_text(size = 12),
        plot.background = element_rect(colour = "black", fill = NA, linewidth = 1))

###########################
# The graph was too simple
###########################


# Hand-entered summary data for the two promotions (one row per promotion).
# Value holds hard-coded figures; per the original author's note they are
# rounded from 23 * 1.5 * 2.1 (= 72.45) for Promotion A and 23 * 2.1 (= 48.3)
# for Promotion B. SE holds the standard error attached to each value.
# stringsAsFactors = TRUE turns the Promotion column into a factor whose
# levels are "Promotion A" and "Promotion B".
data_too_simple <- data.frame(
  Promotion = c("Promotion A", "Promotion B"),
  Value = c(72.5, 48),
  SE = c(2.5, 2),
  stringsAsFactors = TRUE
)


# Bar chart of the two promotions with standard-error bars and value labels
ggplot(data_too_simple, aes(x = Promotion, y = Value, fill = Promotion)) +
  # geom_col() draws bar heights straight from the Value column; it is the
  # idiomatic equivalent of geom_bar(stat = "identity"). Bars are 0.7 wide.
  geom_col(position = position_dodge(), width = 0.7) +
  # Error bars spanning Value - SE to Value + SE; width sets the cap size and
  # the dodge keeps them centred on their bars
  geom_errorbar(aes(ymin = Value - SE, ymax = Value + SE),
                width = 0.2, position = position_dodge(0.7)) +
  # Label each bar with its rounded Value followed by a percent sign;
  # vjust lifts the label above the bar top, size sets the font size
  geom_text(aes(label = paste0(round(Value), "%")), vjust = -1.5, size = 5) +
  # Fixed fill colour per promotion
  scale_fill_manual(values = c("Promotion A" = "#FF5733", "Promotion B" = "#33C3FF")) +
  # Apply a minimal theme for a clean look
  theme_minimal() +
  # Plot title, axis labels and a subtitle explaining the error bars
  labs(title = "Comparison of Promotion A and Promotion B",
       x = "Promotion",
       y = "Customer Purchase (%)",
       subtitle = "Error bars represent standard errors") +
  # y-axis runs from 0 to 100 with a break every 10 units
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10)) +
  # Text sizes, bold title and a black border around the whole plot.
  # element_rect() takes `linewidth` for the border thickness; its `size`
  # argument is deprecated as of ggplot2 3.4.0 and triggers a warning.
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 16, face = "bold"),
        plot.background = element_rect(colour = "black", fill = NA, linewidth = 1))

##########################################
# Graph leads to stakeholder tunnel vision
##########################################


# Make the built-in mtcars dataset available in the workspace
data(mtcars)
# Keep only the seven variables to correlate
# (this overwrites the workspace copy of mtcars with the reduced version)
mtcars <- select(mtcars, wt, hp, cyl, disp, qsec, mpg, drat)

# Correlation matrix of the selected columns, using cor()'s default settings
cor_matrix <- cor(mtcars)

# Replace the short column codes with readable English labels on both the
# rows and the columns. The labels are listed in the same order as the
# columns selected above.
dimnames(cor_matrix) <- rep(
  list(c("Weight", "Horsepower", "Cylinders", "Displacement",
         "1/4 Mile Time", "Miles per Gallon", "Rear Axle Ratio")),
  2
)

# Draw the upper triangle of the correlation matrix, one circle per cell
corrplot(
  cor_matrix,
  # Plot title
  title = "Correlation of Important Variables",
  # Circles in the upper triangle only
  method = "circle",
  type = "upper",
  # 200-step colour ramp interpolated through red, white and blue
  col = colorRampPalette(c("red", "white", "blue"))(200),
  # Variable-name labels: text size and colour
  tl.cex = 0.8,
  tl.col = "black",
  # Colour legend: text size, placed on the right
  cl.cex = 0.8,
  cl.pos = "r",
  # Plot margins in the order bottom, left, top, right
  mar = c(1, 1, 2, 1)
)
# Frame the whole figure region with a thick black box
box(which = "figure", col = "black", lwd = 3)


##########################################
# When Data Reveal an Uncomfortable Truth
##########################################

# Made-up business metric for seven cities, with the rows sorted from the
# highest to the lowest Business_Metric
data_uncomfortable_truth <- arrange(
  data.frame(
    City = c("London", "Paris", "Berlin", "Madrid", "Rome", "Amsterdam", "Vienna"),
    Business_Metric = c(85, 78, 65, 60, 55, 50, 45)
  ),
  desc(Business_Metric)
)


# Bar chart of the business metric per city
data_uncomfortable_truth %>%
  # reorder() sorts the cities on the x-axis by descending Business_Metric;
  # each city gets its own fill colour
  ggplot(aes(x = reorder(City, -Business_Metric),
             y = Business_Metric,
             fill = City)) +
  # geom_col() draws bar heights straight from Business_Metric; it is the
  # idiomatic equivalent of geom_bar(stat = "identity"). Bars are 0.7 wide.
  geom_col(width = 0.7) +
  # minimal theme
  theme_minimal() +
  # title and axis labels
  labs(title = "Business Metric by City",
       x = "City",
       y = "Business Metric") +
  # Slanted x-axis labels, bold title, no legend (the x-axis already names
  # the cities) and a black border around the whole plot.
  # element_rect() takes `linewidth` for the border thickness; its `size`
  # argument is deprecated as of ggplot2 3.4.0 and triggers a warning.
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(size = 16, face = "bold"),
        legend.position = "none",
        plot.background = element_rect(colour = "black", fill = NA, linewidth = 1))
Dominic Botezariu
Dominic Botezariu – https://www.noobz.ro/
Creator de site și redactor-șef.

Cele mai noi știri

Pe același subiect

LĂSAȚI UN MESAJ

Vă rugăm să introduceți comentariul dvs.!
Introduceți aici numele dvs.