# Load the raw crown-measurement spreadsheet.
data <- read_xlsx("Crown measurement dataset Kem Kem theropods.xlsx")
# Drop the rows whose genus is "Abelisauridae". `%in%` is used rather than
# `!=` because `!=` returns NA for a missing genus, and an NA in a logical row
# index yields a row of all-NA values instead of the original row.
data <- data[!(data$`Taxa (Genus)` %in% "Abelisauridae"), ]
# Cells that consist only of a placeholder symbol are treated as missing.
for (placeholder in c("?", "~", "/")) {
  data[data == placeholder] <- NA
}
# Strip annotation symbols from every cell. gsub() returns character vectors,
# so every column is character after this step. The substitutions run in this
# order:
#   1. remove "? " (a question mark followed by a space),
#   2. replace the text "absent" with "0",
#   3. remove every remaining "?", ">", "<", "~", ":" and ";".
# No separate pass is made for the patterns "\\>" and "\\<": in R's default
# regex syntax they are end-/start-of-word anchors that match only the empty
# string, so substituting "" for them leaves the text unchanged. The literal
# ">" and "<" characters are removed by the character class in step 3.
data[] <- lapply(data, function(x) {
  x <- gsub("\\? ", "", x)
  x <- gsub("absent", "0", x)
  gsub("[?<>~:;]", "", x)
})
# Recode LIF entries that were recorded as a range or as alternatives into a
# single value; entries not listed below (including NA) are kept as they are.
# NOTE(review): "10-13" is mapped to 12, not to the 11.5 midpoint - confirm
# this is intended.
lif_recode <- c(
  "6-7" = 6.5,
  "5-6" = 5.5,
  "4-5" = 4.5,
  "3-4" = 3.5,
  "11 or 12" = 11.5,
  "10-13" = 12
)
lif_match <- match(data$LIF, names(lif_recode))
data$LIF <- ifelse(is.na(lif_match), data$LIF, lif_recode[lif_match])
# Copy the column imported as `CH...22` to the plain name `CH`, then remove
# both imported `CH...` columns and the `(DDL/CH)*100` column.
data$CH <- data$CH...22
data <- select(data, -c(CH...60, CH...22, `(DDL/CH)*100`))
# Snapshot of the table before the positional column drop below.
data1 <- data
# Drop columns 1-2, 4-14 and 16-19 by position.
data <- data[, -c(1:2, 4:14, 16:19)] # until 19
# Collapse the two ornamentation columns to presence/absence: any recorded
# value that is not equal to 0 becomes 1, while 0 and NA are left unchanged.
transv_undu <- data$`Transv. Undu.`
data$TransvUndu <- ifelse(
  !is.na(transv_undu) & transv_undu != 0,
  1,
  transv_undu
)
interdent_sulci <- data$`Interdent. sulci`
data$Interdentsulci <- ifelse(
  !is.na(interdent_sulci) & interdent_sulci != 0,
  1,
  interdent_sulci
)
# An LAF entry of "6-7" is replaced by 6.5.
data$LAF <- ifelse(data$LAF == "6-7", 6.5, data$LAF)
# CTU1 keeps only the digits captured by the pattern (the first number in the
# entry); entries the pattern does not match are returned unchanged by sub().
data$CTU1 <- sub(".*?(\\d+).*", "\\1", data$CTU)
# The source columns are no longer needed once the derived ones exist.
data <- select(data, -c(CTU, `Interdent. sulci`, `Transv. Undu.`))
# From the third column onwards: parse every column as numeric, then append a
# natural-log copy of each of those columns named "Log_<column>".
# NOTE(review): log() returns -Inf for 0 and NaN for negative values; neither
# is handled here.
data <- mutate(data, across(3:ncol(data), as.numeric))
data <- mutate(data, across(3:ncol(data), log, .names = "Log_{.col}"))
# Store the tooth-type labels as a factor.
data[["TaxonToothtype"]] <- as.factor(data[["TaxonToothtype"]])
#data$Epoch<-ifelse(data$Epoch=="'Middle Cretaceous'","Middle Cretaceous", data$Epoch)
#data$Epoch<-as.factor(data$Epoch)
# Rebuild as a base data.frame; with its default check.names = TRUE,
# data.frame() makes the column names syntactically valid.
data <- data.frame(data)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Length:1349 Min. : 0.380
## Tyrannosaurus lateral :114 Class :character 1st Qu.: 4.205
## Acrocanthosaurus lateral : 48 Mode :character Median : 9.900
## Richardoestesia lateral : 46 Mean :13.960
## Majungasaurus lateral : 41 3rd Qu.:19.990
## Pectinodon lateral : 40 Max. :54.500
## (Other) :927 NA's :3
## CBW AL CBR CHR
## Min. : 0.540 Min. : 0.55 Min. :0.2500 Min. :0.400
## 1st Qu.: 2.300 1st Qu.: 12.24 1st Qu.:0.4595 1st Qu.:1.632
## Median : 6.200 Median : 30.52 Median :0.5453 Median :1.905
## Mean : 9.128 Mean : 37.46 Mean :0.5935 Mean :1.933
## 3rd Qu.:13.560 3rd Qu.: 55.81 3rd Qu.:0.6917 3rd Qu.:2.191
## Max. :48.600 Max. :152.84 Max. :2.1841 Max. :4.222
## NA's :68 NA's :338 NA's :80 NA's :13
## MCL MCW MCR MDE
## Min. : 0.320 Min. : 0.940 Min. :0.3841 Min. :-13.880
## 1st Qu.: 6.685 1st Qu.: 4.500 1st Qu.:0.5034 1st Qu.: 0.000
## Median :12.130 Median : 7.460 Median :0.5786 Median : 0.000
## Mean :13.053 Mean : 8.327 Mean :0.6109 Mean : 5.752
## 3rd Qu.:18.015 3rd Qu.:11.050 3rd Qu.:0.6896 3rd Qu.: 8.592
## Max. :37.100 Max. :30.200 Max. :1.2792 Max. : 58.400
## NA's :862 NA's :892 NA's :893 NA's :1009
## MSL MEC LAF LIF
## Min. : 1.42 Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 17.41 1st Qu.: 68.65 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 28.07 Median :100.00 Median : 0.000 Median : 0.0000
## Mean : 32.88 Mean : 83.59 Mean : 0.407 Mean : 0.4899
## 3rd Qu.: 44.37 3rd Qu.:100.00 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :123.63 Max. :113.69 Max. :15.000 Max. :15.0000
## NA's :1010 NA's :1016 NA's :736 NA's :752
## DMT DDT DLAT DLIT
## Min. :0.100 Min. : 0.100 Min. :0.100 Min. :0.10
## 1st Qu.:1.400 1st Qu.: 1.200 1st Qu.:1.100 1st Qu.:1.10
## Median :2.140 Median : 3.200 Median :2.700 Median :2.26
## Mean :2.900 Mean : 3.191 Mean :2.695 Mean :2.48
## 3rd Qu.:4.492 3rd Qu.: 4.350 3rd Qu.:4.060 3rd Qu.:3.39
## Max. :8.500 Max. :10.320 Max. :8.140 Max. :7.95
## NA's :1303 NA's :1304 NA's :1302 NA's :1304
## CA CA2 MA MC
## Min. : 8.50 Min. :-1.1200 Min. : 4.66 Min. : 4.70
## 1st Qu.:68.27 1st Qu.:-0.0800 1st Qu.: 9.00 1st Qu.: 9.16
## Median :83.26 Median : 0.0100 Median :11.10 Median :12.00
## Mean :74.85 Mean : 0.0016 Mean :13.75 Mean :16.16
## 3rd Qu.:86.34 3rd Qu.: 0.1000 3rd Qu.:14.00 3rd Qu.:19.00
## Max. :88.11 Max. : 0.3600 Max. :60.00 Max. :57.90
## NA's :986 NA's :1007 NA's :873 NA's :584
## MB DA DC DB
## Min. : 6.00 Min. : 4.00 Min. : 0.00 Min. : 6.00
## 1st Qu.:11.00 1st Qu.: 9.50 1st Qu.:10.03 1st Qu.:11.50
## Median :13.00 Median :12.00 Median :15.00 Median :14.60
## Mean :14.25 Mean :13.91 Mean :17.87 Mean :16.51
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:21.25 3rd Qu.:18.56
## Max. :45.00 Max. :71.00 Max. :70.00 Max. :80.00
## NA's :997 NA's :776 NA's :187 NA's :829
## MAVG DAVG DAVG2 TDD
## Min. : 0.000 Min. : 1.56 Min. :-0.920 Min. : 0.20
## 1st Qu.: 9.195 1st Qu.: 7.50 1st Qu.:-0.090 1st Qu.: 35.95
## Median :11.854 Median :11.50 Median : 0.011 Median : 57.19
## Mean :14.457 Mean :13.71 Mean : 0.899 Mean : 77.80
## 3rd Qu.:16.144 3rd Qu.:16.25 3rd Qu.: 0.130 3rd Qu.:108.64
## Max. :55.000 Max. :80.00 Max. :35.000 Max. :368.62
## NA's :633 NA's :272 NA's :1037 NA's :281
## DSDI CMA CAA CDA
## Min. : 0.6654 Min. : 16.11 Min. : 3.151 Min. : 17.79
## 1st Qu.: 0.9322 1st Qu.: 58.10 1st Qu.:23.475 1st Qu.: 80.06
## Median : 1.0000 Median : 64.80 Median :27.060 Median : 87.06
## Mean : 3.8502 Mean : 64.54 Mean :27.313 Mean : 88.15
## 3rd Qu.: 1.1167 3rd Qu.: 71.52 3rd Qu.:31.113 3rd Qu.: 94.38
## Max. :269.8500 Max. :148.96 Max. :74.262 Max. :160.74
## NA's :730 NA's :432 NA's :427 NA's :428
## MDL DDL ...61 CH
## Min. :0.0864 Min. :0.07143 Min. : NA Min. : 0.57
## 1st Qu.:0.2632 1st Qu.:0.23529 1st Qu.: NA 1st Qu.: 7.60
## Median :0.4167 Median :0.33333 Median : NA Median : 18.08
## Mean :0.4079 Mean :0.36173 Mean :NaN Mean : 28.11
## 3rd Qu.:0.5459 3rd Qu.:0.49384 3rd Qu.: NA 3rd Qu.: 41.14
## Max. :1.0638 Max. :1.11111 Max. : NA Max. :145.55
## NA's :584 NA's :187 NA's :1349 NA's :6
## TransvUndu Interdentsulci CTU1 Log_CBL
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :-0.9676
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.: 1.4363
## Median :1.0000 Median :0.0000 Median : 2.000 Median : 2.2925
## Mean :0.5215 Mean :0.4162 Mean : 1.513 Mean : 2.1891
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.: 2.9952
## Max. :1.0000 Max. :1.0000 Max. :10.000 Max. : 3.9982
## NA's :954 NA's :979 NA's :953 NA's :3
## Log_CBW Log_AL Log_CBR Log_CHR
## Min. :-0.6162 Min. :-0.5978 Min. :-1.3863 Min. :-0.9163
## 1st Qu.: 0.8329 1st Qu.: 2.5051 1st Qu.:-0.7777 1st Qu.: 0.4895
## Median : 1.8245 Median : 3.4184 Median :-0.6064 Median : 0.6444
## Mean : 1.7245 Mean : 3.1987 Mean :-0.5725 Mean : 0.6328
## 3rd Qu.: 2.6071 3rd Qu.: 4.0220 3rd Qu.:-0.3685 3rd Qu.: 0.7842
## Max. : 3.8836 Max. : 5.0294 Max. : 0.7812 Max. : 1.4404
## NA's :68 NA's :338 NA's :80 NA's :13
## Log_MCL Log_MCW Log_MCR Log_MDE
## Min. :-1.139 Min. :-0.0619 Min. :-0.9570 Min. : -Inf
## 1st Qu.: 1.900 1st Qu.: 1.5041 1st Qu.:-0.6865 1st Qu.: -Inf
## Median : 2.496 Median : 2.0096 Median :-0.5471 Median : -Inf
## Mean : 2.335 Mean : 1.9125 Mean :-0.5191 Mean : -Inf
## 3rd Qu.: 2.891 3rd Qu.: 2.4024 3rd Qu.:-0.3717 3rd Qu.:2.184
## Max. : 3.614 Max. : 3.4078 Max. : 0.2462 Max. :4.067
## NA's :862 NA's :892 NA's :893 NA's :1029
## Log_MSL Log_MEC Log_LAF Log_LIF
## Min. :0.3507 Min. : -Inf Min. : -Inf Min. : -Inf
## 1st Qu.:2.8568 1st Qu.:4.229 1st Qu.: -Inf 1st Qu.: -Inf
## Median :3.3347 Median :4.605 Median : -Inf Median : -Inf
## Mean :3.2561 Mean : -Inf Mean : -Inf Mean : -Inf
## 3rd Qu.:3.7926 3rd Qu.:4.605 3rd Qu.: -Inf 3rd Qu.: -Inf
## Max. :4.8173 Max. :4.734 Max. :2.708 Max. :2.708
## NA's :1010 NA's :1016 NA's :736 NA's :752
## Log_DMT Log_DDT Log_DLAT Log_DLIT
## Min. :-2.3026 Min. :-2.3026 Min. :-2.3026 Min. :-2.3026
## 1st Qu.: 0.3365 1st Qu.: 0.1823 1st Qu.: 0.0912 1st Qu.: 0.0953
## Median : 0.7604 Median : 1.1632 Median : 0.9933 Median : 0.8154
## Mean : 0.7138 Mean : 0.7984 Mean : 0.5972 Mean : 0.5271
## 3rd Qu.: 1.5024 3rd Qu.: 1.4702 3rd Qu.: 1.4011 3rd Qu.: 1.2208
## Max. : 2.1401 Max. : 2.3341 Max. : 2.0968 Max. : 2.0732
## NA's :1303 NA's :1304 NA's :1302 NA's :1304
## Log_CA Log_CA2 Log_MA Log_MC
## Min. :2.140 Min. : -Inf Min. :1.539 Min. :1.548
## 1st Qu.:4.223 1st Qu.:-3.219 1st Qu.:2.197 1st Qu.:2.215
## Median :4.422 Median :-2.408 Median :2.407 Median :2.485
## Mean :4.272 Mean : -Inf Mean :2.495 Mean :2.631
## 3rd Qu.:4.458 3rd Qu.:-1.897 3rd Qu.:2.639 3rd Qu.:2.944
## Max. :4.479 Max. :-1.022 Max. :4.094 Max. :4.059
## NA's :986 NA's :1163 NA's :873 NA's :584
## Log_MB Log_DA Log_DC Log_DB
## Min. :1.792 Min. :1.386 Min. : -Inf Min. :1.792
## 1st Qu.:2.398 1st Qu.:2.251 1st Qu.:2.305 1st Qu.:2.442
## Median :2.565 Median :2.485 Median :2.708 Median :2.681
## Mean :2.608 Mean :2.527 Mean : -Inf Mean :2.721
## 3rd Qu.:2.773 3rd Qu.:2.708 3rd Qu.:3.056 3rd Qu.:2.921
## Max. :3.807 Max. :4.263 Max. :4.248 Max. :4.382
## NA's :997 NA's :776 NA's :187 NA's :829
## Log_MAVG Log_DAVG Log_DAVG2 Log_TDD
## Min. : -Inf Min. :0.4447 Min. : -Inf Min. :-1.609
## 1st Qu.:2.219 1st Qu.:2.0149 1st Qu.:-2.996 1st Qu.: 3.582
## Median :2.473 Median :2.4423 Median :-2.263 Median : 4.046
## Mean : -Inf Mean :2.3736 Mean : -Inf Mean : 4.038
## 3rd Qu.:2.781 3rd Qu.:2.7881 3rd Qu.:-1.561 3rd Qu.: 4.688
## Max. :4.007 Max. :4.3820 Max. : 3.555 Max. : 5.910
## NA's :633 NA's :272 NA's :1174 NA's :281
## Log_DSDI Log_CMA Log_CAA Log_CDA
## Min. :-0.4074 Min. :2.779 Min. :1.148 Min. :2.879
## 1st Qu.:-0.0702 1st Qu.:4.062 1st Qu.:3.156 1st Qu.:4.383
## Median : 0.0000 Median :4.171 Median :3.298 Median :4.467
## Mean : 0.1043 Mean :4.149 Mean :3.274 Mean :4.467
## 3rd Qu.: 0.1104 3rd Qu.:4.270 3rd Qu.:3.438 3rd Qu.:4.547
## Max. : 5.5979 Max. :5.004 Max. :4.308 Max. :5.080
## NA's :730 NA's :432 NA's :427 NA's :428
## Log_MDL Log_DDL Log_...61 Log_CH
## Min. :-2.4493 Min. :-2.6391 Min. : NA Min. :-0.5621
## 1st Qu.:-1.3350 1st Qu.:-1.4469 1st Qu.: NA 1st Qu.: 2.0281
## Median :-0.8755 Median :-1.0986 Median : NA Median : 2.8948
## Mean :-1.0214 Mean :-1.1344 Mean :NaN Mean : 2.8269
## 3rd Qu.:-0.6054 3rd Qu.:-0.7056 3rd Qu.: NA 3rd Qu.: 3.7170
## Max. : 0.0619 Max. : 0.1054 Max. : NA Max. : 4.9805
## NA's :584 NA's :187 NA's :1349 NA's :6
## Log_TransvUndu Log_Interdentsulci Log_CTU1
## Min. :-Inf Min. :-Inf Min. : -Inf
## 1st Qu.:-Inf 1st Qu.:-Inf 1st Qu.: -Inf
## Median : 0 Median :-Inf Median :0.6931
## Mean :-Inf Mean :-Inf Mean : -Inf
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.:1.0986
## Max. : 0 Max. : 0 Max. :2.3026
## NA's :954 NA's :979 NA's :953
# Minimum group size: half the number of columns after the first two.
# NOTE(review): presumably this is the number of raw measurements, each of
# which has a Log_ twin - confirm.
lennn <- (ncol(data1_cleaned) - 2) / 2
# Frequency table of tooth types (columns Var1 and Freq), sorted from the most
# to the least frequent.
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
# seq_len() rather than 1:nrow() so that an empty table gives an empty ID
# column instead of c(1, 0).
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1
# Keep only the tooth types with strictly more than lennn rows.
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% data1_cleaned1$TaxonToothtype,
]
summary(data1_cleaned)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Length:931 Min. : 1.40
## Tyrannosaurus lateral :106 Class :character 1st Qu.: 4.74
## Richardoestesia lateral : 45 Mode :character Median :11.25
## Acrocanthosaurus lateral : 42 Mean :15.22
## Majungasaurus lateral : 40 3rd Qu.:22.56
## Pectinodon lateral : 40 Max. :54.50
## (Other) :525
## CBW CBR CHR DC
## Min. : 0.600 Min. :0.2500 Min. :0.400 Min. : 4.50
## 1st Qu.: 2.200 1st Qu.:0.4473 1st Qu.:1.632 1st Qu.:10.00
## Median : 5.800 Median :0.5206 Median :1.903 Median :15.00
## Mean : 9.451 Mean :0.5682 Mean :1.911 Mean :17.40
## 3rd Qu.:14.495 3rd Qu.:0.6531 3rd Qu.:2.172 3rd Qu.:20.25
## Max. :48.600 Max. :2.1840 Max. :3.509 Max. :60.00
##
## DDL CH Log_CBL Log_CBW
## Min. :0.08333 Min. : 2.200 Min. :0.3365 Min. :-0.5108
## 1st Qu.:0.24692 1st Qu.: 8.425 1st Qu.:1.5560 1st Qu.: 0.7885
## Median :0.33333 Median : 18.810 Median :2.4204 Median : 1.7579
## Mean :0.36742 Mean : 30.574 Mean :2.3168 Mean : 1.7015
## 3rd Qu.:0.50000 3rd Qu.: 45.535 3rd Qu.:3.1162 3rd Qu.: 2.6738
## Max. :1.11111 Max. :145.550 Max. :3.9982 Max. : 3.8836
##
## Log_CBR Log_CHR Log_DC Log_DDL
## Min. :-1.3863 Min. :-0.9163 Min. :1.504 Min. :-2.4849
## 1st Qu.:-0.8045 1st Qu.: 0.4897 1st Qu.:2.303 1st Qu.:-1.3987
## Median :-0.6527 Median : 0.6436 Median :2.708 Median :-1.0986
## Mean :-0.6141 Mean : 0.6248 Mean :2.725 Mean :-1.1153
## 3rd Qu.:-0.4261 3rd Qu.: 0.7755 3rd Qu.:3.008 3rd Qu.:-0.6931
## Max. : 0.7812 Max. : 1.2554 Max. :4.094 Max. : 0.1054
##
## Log_CH
## Min. :0.7885
## 1st Qu.:2.1312
## Median :2.9344
## Mean :2.9493
## 3rd Qu.:3.8185
## Max. :4.9805
##
# Log-scale table: columns 1 and 2 plus every column whose name contains "Log".
selected_cols <- c(1, 2, which(grepl("Log", names(data1_cleaned))))
data_log <- data1_cleaned[, selected_cols]
data_log
# Replace spaces with underscores in every column name except the first.
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])
# Raw-scale table: every column whose name does not contain "Log", with
# column 1 always included and listed first.
log_cols <- grep("Log", names(data1_cleaned))
cols_to_keep <- union(1, setdiff(1:ncol(data1_cleaned), log_cols))
data_original <- data1_cleaned[, cols_to_keep]
data_original
# Draw a lower-triangle, colour-coded correlation plot: coefficients printed in
# black, labels in black rotated 45 degrees, diagonal omitted, variables
# ordered by hierarchical clustering, 200-step blue-white-red palette.
draw_corrplot <- function(corr) {
  corrplot(
    corr,
    method = "color", type = "lower",
    addCoef.col = "black",
    tl.col = "black", tl.srt = 45,
    diag = FALSE,
    order = "hclust",
    col = colorRampPalette(c("blue", "white", "red"))(200)
  )
}
# Correlations among the raw-scale columns (first two columns excluded).
correlation_matrix <- cor(data_original[, -c(1, 2)])
draw_corrplot(correlation_matrix)
# Correlations among the log-scale columns; this overwrites correlation_matrix.
correlation_matrix <- cor(data_log[, -c(1, 2)])
draw_corrplot(correlation_matrix)
## [1] "TaxonToothtype" "Epoch" "CBL" "CBW"
## [5] "CBR" "CHR" "DC" "DDL"
## [9] "CH"
# Keep only the rows whose TaxonToothtype group has at least `min_obs` rows;
# the result is returned ungrouped.
drop_rare_taxa <- function(tbl, min_obs = 10) {
  by_taxon <- group_by(tbl, TaxonToothtype)
  ungroup(filter(by_taxon, n() >= min_obs))
}
data_log <- drop_rare_taxa(data_log)
data_original <- drop_rare_taxa(data_original)
# Names of the tooth types that still have at least one row in data_log.
category_counts <- table(data_log$TaxonToothtype)
unique_values <- names(category_counts)[as.vector(category_counts > 0)]
# Print the result
#print(unique_values)
# Rows whose tooth type is not among the retained types (unique_values) are
# relabelled with their genus name; all other rows keep their tooth-type label.
keeps_label <- data1$TaxonToothtype %in% unique_values
data1$TaxonToothtype <- ifelse(
  keeps_label,
  data1$TaxonToothtype,
  data1$`Taxa (Genus)`
)
# Drop columns 1-2, 4-14 and 16-19 by position.
data1 <- data1[, -c(1:2, 4:14, 16:19)] # until 19
#clade: 4, taxa:2, teethtaxa: 3, cladetteth: 5, epoch:15
#data<-inner_join(dd,data)
#data<-data[!duplicated(data),]
# Collapse the two ornamentation columns to presence/absence: any recorded
# value that is not equal to 0 becomes 1, while 0 and NA are left unchanged.
transv_undu1 <- data1$`Transv. Undu.`
data1$TransvUndu <- ifelse(
  !is.na(transv_undu1) & transv_undu1 != 0,
  1,
  transv_undu1
)
interdent_sulci1 <- data1$`Interdent. sulci`
data1$Interdentsulci <- ifelse(
  !is.na(interdent_sulci1) & interdent_sulci1 != 0,
  1,
  interdent_sulci1
)
# An LAF entry of "6-7" is replaced by 6.5.
data1$LAF <- ifelse(data1$LAF == "6-7", 6.5, data1$LAF)
# CTU1 keeps only the digits captured by the pattern (the first number in the
# entry); entries the pattern does not match are returned unchanged by sub().
data1$CTU1 <- sub(".*?(\\d+).*", "\\1", data1$CTU)
# The source columns are no longer needed once the derived ones exist.
data1 <- select(data1, -c(CTU, `Interdent. sulci`, `Transv. Undu.`))
# From the third column onwards: parse every column as numeric, then append a
# natural-log copy of each of those columns named "Log_<column>".
# NOTE(review): log() returns -Inf for 0 and NaN for negative values; neither
# is handled here.
data1 <- mutate(data1, across(3:ncol(data1), as.numeric))
data1 <- mutate(data1, across(3:ncol(data1), log, .names = "Log_{.col}"))
## Warning: There were 17 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), log, .names = "Log_{.col}")`.
## Caused by warning:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# Store the tooth-type labels as a factor.
data1[["TaxonToothtype"]] <- as.factor(data1[["TaxonToothtype"]])
#data1$Epoch<-ifelse(data1$Epoch=="'Middle Cretaceous'","Middle Cretaceous", data1$Epoch)
#data1$Epoch<-as.factor(data1$Epoch)
#data$Taxa<-as.factor(paste0(data$`Taxa (Genus)`,data$Maturity,sep=" "))
# Rebuild as a base data.frame; with its default check.names = TRUE,
# data.frame() makes the column names syntactically valid.
data1 <- data.frame(data1)
# Number of NA cells in each column.
# NOTE(review): is.na() is FALSE for -Inf, so any -Inf produced by log(0) in
# the Log_ columns is not counted as missing here and is not removed by
# na.omit() below - confirm this is intended.
missing_counts <- colSums(is.na(data1))
# Keep the columns in which at most 15% of the rows are NA. drop = FALSE keeps
# the result a data frame even when only one column qualifies.
data1_cleaned <- data1[, missing_counts <= nrow(data1) * 0.15, drop = FALSE]
# Remove every row that still contains an NA.
data1_cleaned <- na.omit(data1_cleaned)
data1_cleaned
# Minimum group size: half the number of columns after the first two.
# NOTE(review): presumably this is the number of raw measurements, each of
# which has a Log_ twin - confirm.
lennn <- (ncol(data1_cleaned) - 2) / 2
# Frequency table of tooth types (columns Var1 and Freq), sorted from the most
# to the least frequent.
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
# seq_len() rather than 1:nrow() so that an empty table gives an empty ID
# column instead of c(1, 0).
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1
# Keep only the tooth types with strictly more than lennn rows.
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% data1_cleaned1$TaxonToothtype,
]
summary(data1_cleaned)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Length:968 Min. : 1.400
## Tyrannosaurus lateral :106 Class :character 1st Qu.: 4.815
## Richardoestesia lateral : 45 Mode :character Median :11.310
## Acrocanthosaurus lateral : 42 Mean :15.179
## Majungasaurus lateral : 40 3rd Qu.:22.348
## Pectinodon lateral : 40 Max. :54.500
## (Other) :562
## CBW CBR CHR DC
## Min. : 0.600 Min. :0.2500 Min. :0.400 Min. : 4.50
## 1st Qu.: 2.200 1st Qu.:0.4489 1st Qu.:1.639 1st Qu.:10.00
## Median : 6.000 Median :0.5243 Median :1.913 Median :15.00
## Mean : 9.408 Mean :0.5710 Mean :1.920 Mean :17.33
## 3rd Qu.:14.402 3rd Qu.:0.6590 3rd Qu.:2.181 3rd Qu.:20.00
## Max. :48.600 Max. :2.1840 Max. :3.509 Max. :60.00
##
## DDL CH Log_CBL Log_CBW
## Min. :0.08333 Min. : 2.20 Min. :0.3365 Min. :-0.5108
## 1st Qu.:0.25000 1st Qu.: 8.50 1st Qu.:1.5717 1st Qu.: 0.7885
## Median :0.33333 Median : 19.84 Median :2.4257 Median : 1.7918
## Mean :0.36710 Mean : 30.54 Mean :2.3218 Mean : 1.7114
## 3rd Qu.:0.50000 3rd Qu.: 45.19 3rd Qu.:3.1067 3rd Qu.: 2.6674
## Max. :1.11111 Max. :145.55 Max. :3.9982 Max. : 3.8836
##
## Log_CBR Log_CHR Log_DC Log_DDL
## Min. :-1.3863 Min. :-0.9163 Min. :1.504 Min. :-2.4849
## 1st Qu.:-0.8010 1st Qu.: 0.4943 1st Qu.:2.303 1st Qu.:-1.3863
## Median :-0.6457 Median : 0.6487 Median :2.708 Median :-1.0986
## Mean :-0.6093 Mean : 0.6297 Mean :2.723 Mean :-1.1136
## 3rd Qu.:-0.4171 3rd Qu.: 0.7799 3rd Qu.:2.996 3rd Qu.:-0.6931
## Max. : 0.7812 Max. : 1.2554 Max. :4.094 Max. : 0.1054
##
## Log_CH
## Min. :0.7885
## 1st Qu.:2.1401
## Median :2.9880
## Mean :2.9590
## 3rd Qu.:3.8108
## Max. :4.9805
##
# Log-scale table: columns 1 and 2 plus every column whose name contains "Log".
selected_cols <- c(1, 2, which(grepl("Log", names(data1_cleaned))))
data_log <- data1_cleaned[, selected_cols]
data_log
# Replace spaces with underscores in every column name except the first.
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])
# Raw-scale table: every column whose name does not contain "Log", with
# column 1 always included and listed first.
log_cols <- grep("Log", names(data1_cleaned))
cols_to_keep <- union(1, setdiff(1:ncol(data1_cleaned), log_cols))
data_original <- data1_cleaned[, cols_to_keep]
data_original
## [1] "TaxonToothtype" "Epoch" "CBL" "CBW"
## [5] "CBR" "CHR" "DC" "DDL"
## [9] "CH"
# Keep only the rows whose TaxonToothtype group has at least `min_obs` rows;
# the result is returned ungrouped.
drop_rare_taxa <- function(tbl, min_obs = 10) {
  by_taxon <- group_by(tbl, TaxonToothtype)
  ungroup(filter(by_taxon, n() >= min_obs))
}
data_log <- drop_rare_taxa(data_log)
data_original <- drop_rare_taxa(data_original)
# Names of the tooth types that still have at least one row in data_log.
category_counts <- table(data_log$TaxonToothtype)
unique_values1 <- names(category_counts)[as.vector(category_counts > 0)]