Read in data sets for BLAST annotation, PFAM annotation and I-TASSER output metrics
GeneDesc <- read_tsv("../data/GiardiaDB-39_GintestinalisAssemblageA_AnnotatedProteins.description",
col_names = FALSE, col_types = cols())
names(GeneDesc) <- c('GeneID','Description')
metrics_df <- read.delim("../data/metrics_clean.txt",sep="\t", stringsAsFactors = FALSE, quote="", header=TRUE)
mismatch_BLAST <- read_tsv("../data/mismatches_BLAST.tsv", col_types = cols())
PDB_pfam <- read_tsv("../data/hmmer_pdb_all.txt", col_types = cols()) #accessed in Oct 2017
|======= | 7% 1 MB
|======== | 7% 1 MB
|======== | 8% 1 MB
|========= | 8% 1 MB
|========== | 9% 1 MB
|========== | 9% 1 MB
|=========== | 10% 1 MB
|=========== | 10% 1 MB
|============ | 11% 1 MB
|============ | 11% 2 MB
|============= | 12% 2 MB
|============= | 12% 2 MB
|============== | 13% 2 MB
|============== | 13% 2 MB
|=============== | 14% 2 MB
|=============== | 14% 2 MB
|================ | 14% 2 MB
|================ | 15% 2 MB
|================= | 15% 2 MB
|================= | 16% 2 MB
|================== | 16% 3 MB
|================== | 17% 3 MB
|=================== | 17% 3 MB
|==================== | 18% 3 MB
|==================== | 18% 3 MB
|===================== | 19% 3 MB
|===================== | 19% 3 MB
|====================== | 20% 3 MB
|====================== | 20% 3 MB
|======================= | 21% 3 MB
|======================= | 21% 3 MB
|======================== | 22% 3 MB
|======================== | 22% 4 MB
|========================= | 23% 4 MB
|========================= | 23% 4 MB
|========================== | 24% 4 MB
|========================== | 24% 4 MB
|=========================== | 25% 4 MB
|=========================== | 25% 4 MB
|============================ | 26% 4 MB
|============================ | 26% 4 MB
|============================= | 27% 4 MB
|============================== | 27% 4 MB
|============================== | 28% 4 MB
|=============================== | 28% 5 MB
|=============================== | 29% 5 MB
|================================ | 29% 5 MB
|================================ | 29% 5 MB
|================================= | 30% 5 MB
|================================= | 30% 5 MB
|================================== | 31% 5 MB
|================================== | 31% 5 MB
|=================================== | 32% 5 MB
|=================================== | 32% 5 MB
|==================================== | 33% 5 MB
|==================================== | 33% 6 MB
|===================================== | 34% 6 MB
|===================================== | 34% 6 MB
|====================================== | 35% 6 MB
|====================================== | 35% 6 MB
|======================================= | 36% 6 MB
|======================================== | 36% 6 MB
|======================================== | 37% 6 MB
|========================================= | 37% 6 MB
|========================================= | 38% 6 MB
|========================================== | 38% 6 MB
|========================================== | 39% 6 MB
|=========================================== | 39% 7 MB
|=========================================== | 40% 7 MB
|============================================ | 40% 7 MB
|============================================ | 41% 7 MB
|============================================= | 41% 7 MB
|============================================= | 42% 7 MB
|============================================== | 42% 7 MB
|============================================== | 43% 7 MB
|=============================================== | 43% 7 MB
|=============================================== | 43% 7 MB
|================================================ | 44% 7 MB
|================================================ | 44% 7 MB
|================================================= | 45% 8 MB
|================================================== | 45% 8 MB
|================================================== | 46% 8 MB
|=================================================== | 46% 8 MB
|=================================================== | 47% 8 MB
|==================================================== | 47% 8 MB
|==================================================== | 48% 8 MB
|===================================================== | 48% 8 MB
|===================================================== | 49% 8 MB
|====================================================== | 49% 8 MB
|====================================================== | 50% 8 MB
|======================================================= | 50% 9 MB
|======================================================= | 51% 9 MB
|======================================================== | 51% 9 MB
|======================================================== | 52% 9 MB
|========================================================= | 52% 9 MB
|========================================================= | 53% 9 MB
|========================================================== | 53% 9 MB
|=========================================================== | 54% 9 MB
|=========================================================== | 54% 9 MB
|============================================================ | 55% 9 MB
|============================================================ | 55% 9 MB
|============================================================= | 56% 9 MB
|============================================================= | 56% 10 MB
|============================================================== | 57% 10 MB
|============================================================== | 57% 10 MB
|=============================================================== | 57% 10 MB
|=============================================================== | 58% 10 MB
|================================================================ | 58% 10 MB
|================================================================ | 59% 10 MB
|================================================================= | 59% 10 MB
|================================================================= | 60% 10 MB
|================================================================== | 60% 10 MB
|================================================================== | 61% 10 MB
|=================================================================== | 61% 10 MB
|=================================================================== | 62% 11 MB
|==================================================================== | 62% 11 MB
|===================================================================== | 63% 11 MB
|===================================================================== | 63% 11 MB
|====================================================================== | 64% 11 MB
|====================================================================== | 64% 11 MB
|======================================================================= | 65% 11 MB
|======================================================================= | 65% 11 MB
|======================================================================== | 66% 11 MB
|======================================================================== | 66% 11 MB
|========================================================================= | 67% 11 MB
|========================================================================= | 67% 12 MB
|========================================================================== | 68% 12 MB
|========================================================================== | 68% 12 MB
|=========================================================================== | 69% 12 MB
|=========================================================================== | 69% 12 MB
|============================================================================ | 70% 12 MB
|============================================================================ | 70% 12 MB
|============================================================================= | 71% 12 MB
|============================================================================= | 71% 12 MB
|============================================================================== | 71% 12 MB
|============================================================================== | 72% 12 MB
|=============================================================================== | 72% 12 MB
|================================================================================ | 73% 13 MB
|================================================================================ | 73% 13 MB
|================================================================================= | 74% 13 MB
|================================================================================= | 74% 13 MB
|================================================================================== | 75% 13 MB
|================================================================================== | 75% 13 MB
|=================================================================================== | 76% 13 MB
|=================================================================================== | 76% 13 MB
|==================================================================================== | 77% 13 MB
|==================================================================================== | 77% 13 MB
|===================================================================================== | 78% 13 MB
|===================================================================================== | 78% 13 MB
|====================================================================================== | 79% 14 MB
|====================================================================================== | 79% 14 MB
|======================================================================================= | 80% 14 MB
|======================================================================================= | 80% 14 MB
|======================================================================================== | 81% 14 MB
|========================================================================================= | 81% 14 MB
|========================================================================================= | 82% 14 MB
|========================================================================================== | 82% 14 MB
|========================================================================================== | 83% 14 MB
|=========================================================================================== | 83% 14 MB
|=========================================================================================== | 84% 14 MB
|============================================================================================ | 84% 15 MB
|============================================================================================ | 85% 15 MB
|============================================================================================= | 85% 15 MB
|============================================================================================= | 85% 15 MB
|============================================================================================== | 86% 15 MB
|============================================================================================== | 86% 15 MB
|=============================================================================================== | 87% 15 MB
|=============================================================================================== | 87% 15 MB
|================================================================================================ | 88% 15 MB
|================================================================================================ | 88% 15 MB
|================================================================================================= | 89% 15 MB
|================================================================================================= | 89% 15 MB
|================================================================================================== | 90% 16 MB
|=================================================================================================== | 90% 16 MB
|=================================================================================================== | 91% 16 MB
|==================================================================================================== | 91% 16 MB
|==================================================================================================== | 92% 16 MB
|===================================================================================================== | 92% 16 MB
|===================================================================================================== | 93% 16 MB
|====================================================================================================== | 93% 16 MB
|====================================================================================================== | 94% 16 MB
|======================================================================================================= | 94% 16 MB
|======================================================================================================= | 95% 16 MB
|======================================================================================================== | 95% 16 MB
|======================================================================================================== | 96% 17 MB
|========================================================================================================= | 96% 17 MB
|========================================================================================================= | 97% 17 MB
|========================================================================================================== | 97% 17 MB
|========================================================================================================== | 98% 17 MB
|=========================================================================================================== | 98% 17 MB
|============================================================================================================| 99% 17 MB
|============================================================================================================| 99% 17 MB
|=============================================================================================================| 100% 17 MB
PDB_pfam <- PDB_pfam %>% mutate(code_chain=paste0(PDB_ID,CHAIN_ID)) %>%
separate(PFAM_ACC, into=c('PFAMpref','PFAMsuff'))
###Giardia PFAM domains
#GDB_IPS <- read_tsv("http://giardiadb.org/common/downloads/release-39/GintestinalisAssemblageAWB/txt/GiardiaDB-39_GintestinalisAssemblageAWB_InterproDomains.txt", col_names = FALSE)
GDB_IPS <- read_tsv("../data/GiardiaDB-39_GintestinalisAssemblageAWB_InterproDomains.txt",
col_types = cols(), col_names = FALSE)
names(GDB_IPS) <- c('GeneID','source','InterproID','Desc','GDBpfam_start','GDBpfam_end','eVal')
GDB_pfam <- GDB_IPS %>% filter(str_detect(InterproID,'^PF')) %>%
dplyr::rename(PFAMpref=InterproID) %>% distinct()
PFAM_descriptionTable <- read_tsv("../data/PFAM_descriptionMapping.tsv",col_names = TRUE, col_types = cols())
Metrics derived from I-TASSER output: sd of secondary structure predictions, and query:reference length ratio.
metrics_all <- metrics_df %>%
mutate(lenRatio=seq.ssLen/PDB_AA) %>%
mutate(Hprop=nHrc/seq.ssLen,
Eprop=nErc/seq.ssLen,
Cprop=nCrc/seq.ssLen) %>%
rowwise() %>% mutate(SS_sd = sd(c(Hprop, Eprop, Cprop)))
Investigate discordant reference structure matches for Giardia peptides encoding solved structures
mismatch_BLAST_status <- mismatch_BLAST %>%
mutate(matchStatus = factor(case_when(str_detect(status,'miss') ~ 'Matched_other',
str_detect(status,'hit') ~ 'Matched_Gd'))) %>%
mutate(refSpecies = factor(case_when(str_detect(status,'hit') ~ 'Gd',
str_detect(status,'non-Giardia') ~ 'other',
TRUE ~ 'Gd'))) %>%
select(GeneID, matchStatus,refSpecies,Ref_Query_AAlenRatio,bit_score)
mismatch_BLAST_status %>%
rename(`Reference species` = refSpecies,
`Match status` = matchStatus,
`AA length ratio` = Ref_Query_AAlenRatio,
'Bit score' = bit_score) %>%
gather(key,value,-c(GeneID,`Match status`,`Reference species`)) %>%
mutate(xSep=paste0(`Match status`,'_vs_',`Reference species`)) %>%
ggplot(aes(x=xSep, y=value)) +
geom_boxplot(aes(col=`Match status`), alpha=0) +
geom_jitter(aes(col=`Match status`,shape=`Reference species`), width=0.1, size=2) +
facet_wrap(~ key, scales="free") +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
xlab("")
ggsave("../results/SF1.pdf",width=8,height=4)
Test differences in BLAST bit scores, and query:reference AA length ratios
mismatch_BLAST_forTest <- mismatch_BLAST_status %>% unite(match_species, c(matchStatus,refSpecies))
#Test differences in Ref_Query_AAlenRatio
aov_lenRat.res <- aov(Ref_Query_AAlenRatio ~ match_species, data = mismatch_BLAST_forTest )
summary(aov_lenRat.res)
Df Sum Sq Mean Sq F value Pr(>F)
match_species 2 0.01126 0.005628 6.39 0.00649 **
Residuals 22 0.01938 0.000881
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Multiple pair-wise comparisons
TukeyHSD(aov_lenRat.res)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Ref_Query_AAlenRatio ~ match_species, data = mismatch_BLAST_forTest)
$match_species
diff lwr upr p adj
Matched_other_Gd-Matched_Gd_Gd -0.0531666 -0.091663990 -0.01466921 0.0059271
Matched_other_other-Matched_Gd_Gd -0.0004910 -0.038988390 0.03800639 0.9994342
Matched_other_other-Matched_other_Gd 0.0526756 0.005526119 0.09982508 0.0267095
#Test differences in bit_score
aov_bitScore.res <- aov(bit_score ~ match_species, data = mismatch_BLAST_forTest )
summary(aov_bitScore.res)
Df Sum Sq Mean Sq F value Pr(>F)
match_species 2 708405 354202 5.403 0.0123 *
Residuals 22 1442156 65553
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Multiple pair-wise comparisons
TukeyHSD(aov_bitScore.res)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = bit_score ~ match_species, data = mismatch_BLAST_forTest)
$match_species
diff lwr upr p adj
Matched_other_Gd-Matched_Gd_Gd -179.2 -511.3312 152.93122 0.3808485
Matched_other_other-Matched_Gd_Gd -428.2 -760.3312 -96.06878 0.0101240
Matched_other_other-Matched_other_Gd -249.0 -655.7760 157.77601 0.2933180
Join PFAM annotation databases and identify query:reference pairs with at least 1 matching PFAM code.
Annotation: PFAM codes assigned to Giardia query peptides: ‘GDB_pfam’
PFAM codes assigned to peptides encoding PDB reference structures: ‘PDB_pfam’.
metrics_long <- metrics_all %>%
mutate(code_chain=toupper(codechain)) %>%
left_join(GDB_pfam,by="GeneID") %>% rename(GDB_PFAMpref=PFAMpref) %>%
left_join(PDB_pfam,by="code_chain") %>% rename(PDB_PFAMpref=PFAMpref) %>%
distinct() %>% rename(GDB_PFAM_desc = Desc, PDB_PFAM_desc = PFAM_desc)
Plot number of PFAM domains per query:reference pair
nPFAM_Q_R <- metrics_long %>%
select(GeneID, GDB_PFAMpref, PDB_PFAMpref) %>%
distinct() %>%
group_by(GeneID) %>%
summarize_at(vars(GDB_PFAMpref, PDB_PFAMpref), funs( . %>% na.omit() %>% n_distinct )) %>%
rename(nPFAM_GDB = GDB_PFAMpref, nPFAM_PDB = PDB_PFAMpref )
nPFAM_Q_R %>% gather(key,value,-GeneID) %>% ggplot(aes(value)) +
geom_bar(aes(fill=key), position="dodge", show.legend = TRUE)
Calculate n. matching domains
#Because all pairwise combinations of PFAM domains are listed, need only filter for exact match:
nMatch_count <- metrics_long %>%
select(GeneID,code,chain,code_chain,GDB_PFAMpref,PDB_PFAMpref) %>%
distinct() %>%
group_by(GeneID) %>%
filter(GDB_PFAMpref==PDB_PFAMpref) %>%
summarize(n_Match = n())
nMatch <- metrics_all[ , "GeneID"] %>%
left_join(nMatch_count, by="GeneID") %>%
mutate(n_Match = ifelse(is.na(n_Match), 0 ,n_Match))
Summarize PFAM matches across annotation groups
metrics_nMatch <- metrics_all %>%
left_join(nPFAM_Q_R, by="GeneID") %>%
left_join(nMatch,by="GeneID") %>%
mutate(n_Match=ifelse(is.na(n_Match), 0, n_Match)) %>%
mutate(matchStatus=ifelse(n_Match==0,'noMatch','exactMatch')) %>%
left_join(GeneDesc,by="GeneID") %>% select(GeneID,Description,everything()) %>%
mutate(descBin = ifelse(str_detect(Description,'hypothetical') | str_detect(Description,'Hypothetical'), 'Hyp','Annot'))
metrics_nMatch %>% count(descBin,matchStatus)
Relative peptide length and number of matching PFAM domains.
lenRatio_X_nMatch <- metrics_nMatch %>%
select(GeneID, Cov, GDB_AA, lenRatio, contains('nPFAM'),n_Match, descBin) %>%
mutate(`n_Match:nPFAM_PDB` = n_Match/nPFAM_PDB) %>%
na.omit() %>%
group_by(n_Match, nPFAM_PDB) %>% summarize(median_lenRatio = median(lenRatio),
mean_lenRatio = mean(lenRatio),
mean_GDB_AAlen = mean(GDB_AA),
median_GDB_AAlen = median(GDB_AA),
group_size = n()) %>%
arrange(n_Match, nPFAM_PDB, desc(group_size))
metrics_nMatch %>%
filter(nPFAM_PDB<=4) %>% #filter(n_Match==0) %>%
filter(nPFAM_PDB>0) %>%
ggplot() +
geom_boxplot(aes(x=n_Match , y=lenRatio, group=factor(n_Match)), alpha=0, width=0.5)+
geom_jitter(aes(x=n_Match , y=lenRatio, col=factor(nPFAM_PDB),
group=factor(nPFAM_PDB)),
position=position_jitterdodge(dodge.width = 0.85), alpha=0.5, cex=0.25) +
geom_text(data=lenRatio_X_nMatch %>%
filter(n_Match<=4, nPFAM_PDB<=4),
aes(x=n_Match, y=3, label=group_size, group=nPFAM_PDB), size=3) +
facet_wrap( ~ nPFAM_PDB) +
theme(legend.position = "none") +
ggtitle("N. matching PFAM terms vs query:reference AA length ratio",
subtitle="Facet by n. PFAM terms in PDB reference") +
xlab("N. matching PFAM domains") + ylab("Query:reference AA length ratio")
ggsave("../results/SF2.pdf",width=6,height=6)
Plot intersection of BLAST annotation status and PFAM code annotations.
source("upset.R")
upSets <- metrics_nMatch %>% select(GeneID, descBin, nPFAM_GDB, nPFAM_PDB, n_Match) %>%
mutate_at(vars(-GeneID, -descBin) , funs(binary = ifelse(. > 0, 1, 0))) %>%
mutate(descCategory = ifelse(descBin=="Hyp",1,2)) %>%
distinct() %>% select(GeneID, descCategory, contains('binary')) %>%
rename(Query = nPFAM_GDB_binary, Reference = nPFAM_PDB_binary, Match = n_Match_binary)
upSets %>% count(descCategory, Query, Reference, Match)
annotStatus <- function(row, min, max){
newData <- (row["descCategory"] <= max) & (row["descCategory"] > min)}
upset(data.frame(upSets),
sets=c('Query','Reference','Match'),
main.bar.color = 'black',
queries = list(list(query = annotStatus, params = list(1,2),
color="light blue",active=TRUE)), point.size=3)
pdf("../results/F2a.pdf", width = 4,height=3, onefile = FALSE) #works..ish
#pdf("../results/F2a.pdf", width = 4.5,height=3.5, onefile = FALSE)
upset(data.frame(upSets),
sets=c('Query','Reference','Match'),
main.bar.color = 'black',
queries = list(list(query = annotStatus, params = list(1,2),
color="light blue",active=TRUE)), point.size=3)
dev.off()
quartz_off_screen
2
PFAM term enrichment testing
source("rowwise_fisher.R")
matchStatus_PFAMcounts <- metrics_long %>%
left_join(metrics_nMatch %>% select(GeneID, matchStatus), by="GeneID") %>%
dplyr::select(1,lenRatio, matchStatus, GDB_PFAMpref,PDB_PFAMpref) %>%
gather(key=DB,value=PFAM,-c(GeneID,matchStatus,lenRatio)) %>%
na.omit() %>%
filter(lenRatio >0.9 & lenRatio <1.1) %>% ungroup() %>%
count(matchStatus,DB,PFAM) %>%
spread(key=DB, value=n, fill = 0)
#set up parameters for Fisher tables
myStatus <- "exactMatch"
minFreq <- 3
Foreground <- 'PDB_PFAMpref'
background <- 'GDB_PFAMpref'
termSums <- matchStatus_PFAMcounts %>%
filter(get(Foreground) >= minFreq) %>%
filter(matchStatus == myStatus) %>%
summarize(allForeground = sum(get(Foreground)),
allBackground = sum(get(background)),
allTerms = sum(get(Foreground),get(background))) %>% unlist()
Fisher_df <- matchStatus_PFAMcounts %>%
filter(matchStatus==myStatus) %>%
dplyr::rename(Foreground = PDB_PFAMpref,
BACKGROUND = GDB_PFAMpref) %>%
rowwise() %>%
mutate(sumTerm=sum(Foreground,BACKGROUND),
allForeground=termSums[[1]],
allBackground=termSums[[2]],
allTerms=termSums[[3]]) %>%
mutate('a' = Foreground,
'b' = sumTerm - Foreground,
'c' = allForeground - Foreground,
'd' = allTerms - allForeground - b) %>%
dplyr::select(PFAM,Foreground,BACKGROUND,everything()) %>%
ungroup()
mydata <- Fisher_df
p <- t(apply(mydata %>%
dplyr::select(a,b,c,d) %>%
filter(a >= minFreq), 1, row_fisher))
results <- cbind(mydata %>% filter(a >= minFreq) %>%
dplyr::select(1:7), p) %>%
#1:7 as no background of repressed, lowly transcribed (<0.85 percentile; i.e., the equivalent of the nonDTG set), is included.
arrange(p_val) %>%
mutate(adjP=p.adjust(.$p_val, method="BH" )) #correction for multiple comparisons
#No significant enrichment
results %>% filter(p_val < 0.05) %>%
arrange(p_val) %>%
filter(matchStatus=='exactMatch') %>%
mutate(`Reference_PFAM` = Foreground/allForeground,
`Query_PFAM` = BACKGROUND/allBackground) %>%
left_join(PDB_pfam %>% select(PFAMpref,PFAM_Name,PFAM_desc), by=c("PFAM" = "PFAMpref")) %>%
distinct() %>%
dplyr::select(Foreground,BACKGROUND, PFAM,PFAM_desc,Query_PFAM,Reference_PFAM) %>%
filter(Foreground >= 7) %>% select(-c(Foreground,BACKGROUND)) %>%
gather(key,value,-c(PFAM,PFAM_desc)) %>%
ggplot(aes(x=reorder(PFAM_desc,value,FUN=max),y=value,fill=key)) +
geom_bar(stat="identity",position="dodge", show.legend = TRUE) +
coord_flip() +
xlab("") + ylab("Proportion of PFAM terms") +
labs(fill="")
ggsave("../results/F2b.pdf",width=8,height=3.5)
EF hand domains in NEK kinases
metrics_nMatch %>% filter(n_Match > 0, str_detect(Description,"NEK")) %>%
filter(nPFAM_PDB > n_Match) %>%
filter(code=="3HX4") %>%
left_join(metrics_long,by="GeneID") %>%
select(GeneID, n_Match, contains("PFAMpref"), contains("PFAM_desc")) %>%
distinct() %>%
filter(PDB_PFAMpref=="PF13499")
Kinase domains hypothetical proteins
metrics_nMatch %>%
filter(descBin=="Hyp") %>%
filter(nPFAM_PDB > n_Match) %>%
filter(code=="4O1O") %>%
left_join(metrics_long,by="GeneID") %>%
group_by(GeneID) %>% filter(!str_detect(GDB_PFAM_desc,'kinase')) %>%
select(GeneID, n_Match, contains("PFAMpref"), contains("PFAM_desc")) %>%
distinct() %>% count(GeneID)
NA
Plot n. unique PFAM codes available via query and reference peptides.
metrics_nMatch %>% select(GeneID, contains('nPFAM'), n_Match, matchStatus) %>%
filter(matchStatus=="exactMatch") %>%
select(contains('nPFAM')) %>%
gather(key=source,value=nPFAM) %>%
ggplot(aes(x=nPFAM)) + geom_bar(aes(fill=source), position="dodge") +
xlab("N. unique PFAM codes") + ylab("N. proteins")
metrics_nMatch %>% select(GeneID, contains('nPFAM'), n_Match, matchStatus) %>%
filter(matchStatus=="exactMatch") %>%
select(contains('nPFAM')) %>%
gather(key=source,value=nPFAM) %>%
group_by(source) %>% summarize(mean_nPFAM=mean(nPFAM))
ggsave("../results/F2c.pdf",width=5,height=4)
Create training and test data
Gd_forRF <- metrics_nMatch %>% select(1,TM:RMSD_model_sd,lenRatio,SS_sd,matchStatus) %>% na.omit()
#Gd_forRF %>% str()
geneOrder <- read_tsv("../data/Gd_forRF_geneIDorder.tsv")
Parsed with column specification:
cols(
GeneID = col_character()
)
Gd_forRF <- geneOrder %>% left_join(Gd_forRF, by="GeneID") #original geneOrder
set.seed(1234) ; forTRAIN_exact <- Gd_forRF %>% filter(matchStatus=="exactMatch") %>% sample_n(750);
set.seed(1234) ; forTRAIN_noMatch <- Gd_forRF %>% filter(matchStatus!="exactMatch") %>% sample_n(750);
forTRAIN <- rbind(forTRAIN_exact, forTRAIN_noMatch)
#set.seed(1234); TEST <- Gd_forRF %>% anti_join(forTRAIN, by="GeneID") %>% sample_n(1500) ;
set.seed(4321); forTEST_exact <- Gd_forRF %>% anti_join(forTRAIN_exact, by="GeneID") %>% filter(matchStatus=="exactMatch") %>% sample_n(250)
set.seed(4321); forTEST_noMatch <- Gd_forRF %>% anti_join(forTRAIN_noMatch, by="GeneID") %>% filter(matchStatus!="exactMatch") %>% sample_n(250)
TEST <- rbind(forTEST_exact, forTEST_noMatch)
TRAIN <- forTRAIN %>% dplyr::select(-c(GeneID))
Model training is only run once.
rf_model <- train(matchStatus ~. , data=TRAIN, method="rf",
trControl = trainControl(method="cv", number=5, verboseIter=TRUE),
prox=TRUE, verbose=TRUE)
saveRDS(rf_model,"../data/rf_model.Rds")
Print final model
rf_model <- readRDS("../data/rf_model.Rds")
print(rf_model$finalModel)
Call:
randomForest(x = x, y = y, mtry = param$mtry, proximity = TRUE, verbose = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 9.93%
Confusion matrix:
exactMatch noMatch class.error
exactMatch 694 56 0.07466667
noMatch 93 657 0.12400000
predTEST <- predict(rf_model, newdata=TEST, type="prob")
predALL <- predict(rf_model, newdata=Gd_forRF, type="prob")
#summary(predTEST)
#summary(predALL)
#accuracy on hold-out TEST set:
TEST %>% mutate(RFexact_pred = predTEST$exactMatch) %>%
mutate(predStatus=ifelse(RFexact_pred >= 0.5,"exactMatch","noMatch")) %>%
count(matchStatus, predStatus)
#entire proteome
Gd_forRF %>% mutate(RFexact_pred = predALL$exactMatch) %>%
mutate(predStatus=ifelse(RFexact_pred >= 0.5,"exactMatch","noMatch")) %>%
count(matchStatus, predStatus)
Gd_matchPredict <- Gd_forRF %>% mutate(exactMatchPred = predALL$exactMatch) %>% ungroup()
Gd_imp <- data.frame(varImp(rf_model, scale=FALSE)$importance)
Gd_imp$noi <- row.names(Gd_imp); row.names(Gd_imp) <- NULL
names(Gd_imp) <- c('Contribn', 'impFactor') ; #head(Gd_imp)
Gd_imp <- Gd_imp %>%dplyr::select(2,1) %>% arrange(desc(Contribn)) %>% mutate(propCont = Contribn/sum(Contribn))
imp_plot <- Gd_imp %>%
mutate(impFactor_recode = case_when(impFactor=="AApc" ~ 'AA_%ID',
impFactor=="Cscore" ~ 'C_score',
impFactor=="AApc" ~ "AA_%ID",
impFactor=="Cov" ~ 'Coverage',
impFactor=="exactMatchPred" ~ 'Exact_match_prediction',
impFactor=="lenRatio" ~ 'Length_ratio',
TRUE ~ impFactor)) %>%
ggplot(aes(x=reorder(impFactor_recode, propCont), y=propCont)) +
geom_bar(stat="identity", aes(fill=impFactor_recode), show.legend = FALSE) +
ylab("Importance") + xlab("") + ylim(0,0.3) +
coord_flip()
imp_plot
ggsave("../results/SF3a.pdf",imp_plot, width=4,height=3)
##AUROC
#https://gist.github.com/jwaage/6d8f4eb096e4f18a0894ca1ce27af834
forROC <- Gd_matchPredict %>%
dplyr::select(TM:exactMatchPred) %>%
mutate(matchStatus = abs(as.numeric(factor(matchStatus))-2))
aucOut <- data_frame('sensPlt' = NA, 'specPlt' = NA, 'Metric'=NA,'AUC' = NA)
for (i in names(forROC)){#print(i)
r <- roc(forROC$matchStatus, forROC[ , i] %>% pull())
sens <- r$sensitivities
spec <- r$specificities
a <- auc(r)[1]
rnd <- round(a,3)
result <- data_frame('sensPlt' = rev(sens), 'specPlt' = rev(spec)) %>% mutate(Metric = i, AUC = rnd)
aucOut <- rbind(aucOut,result) %>% na.omit() }
auc_plot <- aucOut %>% mutate(Metric = case_when(Metric=="AApc" ~ 'AA_%ID',
Metric=="Cscore" ~ 'C_score',
Metric=="AApc" ~ "AA_%ID",
Metric=="Cov" ~ 'Coverage',
Metric=="exactMatchPred" ~ 'Exact_match_prediction',
Metric=="lenRatio" ~ 'AA_length_ratio',
TRUE ~ Metric)) %>%
filter(Metric !="matchStatus") %>%
distinct() %>% na.omit() %>% filter(AUC >= 0.7) %>%
ggplot(aes(x=specPlt,y=sensPlt)) +
geom_segment(aes(x = 0, y = 1, xend = 1, yend = 0), alpha = 0.5) +
geom_step(aes(col=Metric), lty = 2,lwd=1) +
scale_x_reverse(name = "False positive rate (Specificity)",limits = c(1,0)) +
ylab("True positive rate (Sensitivity)") +
coord_equal() +
scale_color_manual(values= c("AA_%ID" = "#F8766D","C_score" = "#DE8C00",
"Exact_match_prediction" = 'black','AA_length_ratio' = "#00BA38",
"RMSD" = "#00BFC4","TM_model" = "#F564E3"))
aucOut %>% select(-contains('Plt')) %>% distinct() %>% arrange(desc(AUC))
auc_plot
ggsave("../results/F3b.pdf",auc_plot, width=5,height=5)
Gd_all <- metrics_nMatch %>% left_join(Gd_matchPredict %>% dplyr::select(1,exactMatchPred),by="GeneID")
Gd_all_RFconf <- Gd_all %>%
mutate(RF_confidence = case_when(matchStatus=="exactMatch" & exactMatchPred >= 0.5 ~ 'HiConf',
matchStatus=="noMatch" & exactMatchPred < 0.5 ~ 'LowerConf',
matchStatus=="exactMatch" & exactMatchPred < 0.5 ~ "LowerConf-like",
matchStatus=="noMatch" & exactMatchPred >= 0.5 ~ "HiConf-like",
TRUE ~ ""))
Gd_all_RFconf %>% select(TM:RMSD_model_sd,lenRatio,SS_sd,matchStatus,exactMatchPred,RF_confidence) %>%
rename(Exact_match_prediction = exactMatchPred, AA_length_ratio = lenRatio,
`AA_%ID` = AApc, Coverage = Cov, C_score = Cscore) %>%
filter(C_score > -8 & RMSD_model < 25) %>%
gather(key,value,-c(RF_confidence,matchStatus)) %>%
filter(str_detect(RF_confidence,"Conf")) %>%
na.omit() %>%
mutate(key = fct_relevel(factor(key), c('SS_sd', 'C_score','C_sd', 'AA_length_ratio',
'TM','RMSD','AA_%ID','Coverage',
'TM_model','TM_model_sd','RMSD_model','RMSD_model_sd',
'Exact_match_prediction'))) %>%
ggplot(aes(x=key, y=value, col=RF_confidence)) +
geom_point(position=position_jitterdodge(jitter.width = 0.1, dodge.width=1),
size=0.02, alpha=0.2, show.legend = TRUE) +
geom_boxplot( position=position_dodge(1), alpha=0) +
xlab("") + ylab("") +
theme(legend.title=element_blank(),
strip.background = element_blank(),
strip.text.x = element_text(size=0)) +
facet_wrap( ~ key, scales="free")
ggsave("../results/F4.pdf",width=9.5,height=9)
Technical variation: Train 500 models on the same data.
RF_train_iter500 <- data_frame('dummy'=rep(NA, nrow(Gd_forRF)))
for(i in 1:500){
a <- train(matchStatus ~. , data=TRAIN, method="rf",
trControl = trainControl(method="cv", number=5, verboseIter=TRUE),
prox=TRUE, verbose=TRUE)
fm <- as_data_frame(predict(a, newdata = Gd_forRF, type="prob"))
fm <- fm[ , 1]
names(fm) <- paste0('rf_',i)
RF_train_iter500 <- cbind(RF_train_iter500, fm)
}
RF_train_iter500 <- RF_train_iter500[ , -1]
saveRDS(RF_train_iter500, "../data/RF_train_iter500.Rds")
Plot technical variation
RF_train_iter500 <- readRDS("../data/RF_train_iter500.Rds")
RF_train_iter500_summary <- data_frame('mean' = rowMeans(RF_train_iter500),
'sd' = apply(RF_train_iter500 ,1, sd, na.rm = TRUE),
'sem' = apply(RF_train_iter500 , 1, function(x) sd(x)/sqrt(length(x))),
'len' = apply(RF_train_iter500 , 1, length ))
RF_train_iter500_summary %>% cbind(Gd_forRF) %>% dplyr::select(mean,sd,sem,matchStatus) %>%
ggplot(aes(mean, log10(1/sd))) + geom_point(aes(col=matchStatus), cex=0.5) +
ylim(1,3.5)
ggsave("../results/SF6a.pdf",width=5,height=4)
Robustness: Train 50 models on different training data and set sizes
RF_sample_iter500 <- data_frame("GeneID" = Gd_forRF %>% select(GeneID) %>% pull())
for(i in c(50,100,200,300,400,500)){
for(j in 1:50){
sample_iterTrain <- rbind(Gd_forRF %>% filter(matchStatus=='noMatch') %>% sample_n(i),
Gd_forRF %>% filter(matchStatus=='exactMatch') %>% sample_n(i))
rfMod_iter <- train(matchStatus ~ . , data = sample_iterTrain %>% select(-GeneID), method="rf",
trControl = trainControl(method="cv", number=5, verboseIter=TRUE),
prox=TRUE, verbose=TRUE)
fm_t <- as_data_frame(predict(rfMod_iter, newdata=Gd_forRF, type="prob"))
fm_t <- fm_t[ , 1]
names(fm_t) <- paste('rf_train', 2*i, 'iter', j, sep="_")
RF_sample_iter500 <- cbind(RF_sample_iter500, fm_t)
}
}
saveRDS(RF_sample_iter500, "../data/RF_sample_iter500.Rds")
Plot reproducibility
RF_sample_iter500 <- readRDS("../data/RF_sample_iter500.Rds")
RF_sample_iter500 %>%
gather(key,value,-GeneID) %>%
separate(key,into=c("v1","v2","v3","v4","v5"),sep="_",convert = TRUE) %>%
select(GeneID,v3,v5,value) %>% dplyr::rename(trainingSetSize=v3,iteration=v5) %>%
group_by(GeneID,trainingSetSize) %>% summarize(mean=mean(value),sd=sd(value)) %>%
left_join(Gd_forRF %>% select(GeneID,matchStatus),by="GeneID") %>%
ggplot(aes(x=factor(trainingSetSize),y=sd)) +
geom_boxplot(alpha=0.2, aes(col=matchStatus )) +
xlab('Training set size') + ylab("sd(Probability of high-confidence category)")
ggsave("../results/SF6b.pdf",width=7,height=4)
RF_sample_iter500 %>%
gather(key,value,-GeneID) %>%
separate(key,into=c("v1","v2","v3","v4","v5"),sep="_",convert = TRUE) %>%
select(GeneID,v3,v5,value) %>%
dplyr::rename(trainingSetSize=v3,iteration=v5) %>%
group_by(GeneID,trainingSetSize) %>%
mutate(call=ifelse(value>0.5,1,0)) %>%
group_by(GeneID,trainingSetSize) %>%
mutate(meanCall=mean(call), meanVal=mean(value),
sdCall=sd(call), sdVal=sd(value)) %>%
ungroup() %>%
select(-c(value,iteration,call)) %>% distinct() %>%
left_join(Gd_forRF %>% select(GeneID, matchStatus),by="GeneID") %>%
left_join(Gd_all_RFconf %>% select(GeneID,exactMatchPred,RF_confidence),
by="GeneID") %>% filter(RF_confidence!="") %>%
ggplot(aes(x=meanCall,y=meanVal)) +
geom_point(aes(col=RF_confidence),cex=0.2) +
facet_wrap(~ trainingSetSize)
ggsave("../results/SF6c.pdf",width=8,height=4)
Gd_all_RFconf %>% filter(RF_confidence=="LowerConf-like") %>%
left_join(metrics_long,by="GeneID") %>%
filter(PDB_PFAMpref== GDB_PFAMpref) %>%
select(GeneID,GDB_PFAMpref) %>%
distinct() %>%
left_join(PFAM_descriptionTable,by=c('GDB_PFAMpref' = 'PFAMcode')) %>%
count(desc) %>% arrange(desc(n)) %>% filter(n>1)
Gd_all_RFconf %>% filter(RF_confidence=="LowerConf") %>%
left_join(metrics_long,by="GeneID") %>%
select(GeneID,GDB_PFAMpref) %>%
na.omit() %>%
distinct() %>%
left_join(PFAM_descriptionTable,by=c('GDB_PFAMpref' = 'PFAMcode')) %>%
count(desc) %>% arrange(desc(n)) %>% filter(n>5)
Gd_all_RFconf %>%
left_join(metrics_long,by="GeneID") %>%
select(GeneID,GDB_PFAMpref,PDB_PFAMpref, RF_confidence) %>%distinct() %>%
gather(key,value,-c(GeneID,RF_confidence)) %>%
distinct() %>% ungroup() %>%
filter(RF_confidence!="") %>% na.omit() %>%
group_by(RF_confidence,key) %>% count(value) %>%
spread(RF_confidence,n, fill = 0) %>%
filter(key=="GDB_PFAMpref") %>%
rowwise() %>%
mutate(ratio = (LowerConf /1389) / (sum(`HiConf`,`HiConf-like`,`LowerConf-like`) / 1389)) %>%
filter(LowerConf>0, ratio!="Inf") %>%
arrange(desc(ratio)) %>%
left_join(PFAM_descriptionTable,by=c('value'='PFAMcode')) %>%
select(desc,ratio, everything())
NA
NA
NA
Compare sequence- and structure-based clusters
blst_Nek <- read_tsv("../data/Nek_kinase_BLASTp.tsv", col_names = TRUE)
Parsed with column specification:
cols(
query_id = col_character(),
subject_id = col_character(),
pct_identity = col_double(),
aln_length = col_integer(),
n_of_mismatches = col_integer(),
gap_openings = col_integer(),
q_start = col_integer(),
q_end = col_integer(),
s_start = col_integer(),
s_end = col_integer(),
e_value = col_double(),
bit_score = col_double()
)
TM_Nek <- read_tsv("../data/Nek_kinase_TM.tsv", col_names = TRUE)
Parsed with column specification:
cols(
struc1 = col_character(),
struc2 = col_character(),
TM = col_double()
)
blst_matFull <- blst_Nek %>% mutate(value=percent_rank(bit_score)) %>%
select(query_id,subject_id,value) %>%
spread(subject_id,value, fill = 0) %>% select(-query_id) %>% as.matrix()
rownames(blst_matFull) <- colnames(blst_matFull)
TM_matFull <- TM_Nek %>% spread(struc2,TM, fill=0) %>% select(-struc1) %>% as.matrix()
rownames(TM_matFull) <- colnames(TM_matFull)
blst_TM_mds <- rbind((1-cor(blst_matFull)) %>%
cmdscale() %>%
as_tibble() %>%
mutate(dtype="blst"),
(1-cor(TM_matFull)) %>%
cmdscale() %>%
as_tibble() %>%
mutate(dtype="TM")) %>%
dplyr::rename(Dim.1 = V1, Dim.2 = V2) %>%
mutate(GeneID=str_replace(rep(rownames(blst_matFull),2),"GL_","GL50803_"))
blst_TM_mds %>%
left_join(Gd_all_RFconf, by="GeneID") %>%
mutate(dtype=recode(dtype,blst="BLAST",TM="TM-align")) %>%
filter(!is.na(RF_confidence)) %>%
dplyr::rename("RF confidence" = RF_confidence) %>%
ggplot(aes(x=Dim.1,y=Dim.2 )) +
geom_point(aes(col=`RF confidence`, shape=`RF confidence`), size=2) +
scale_shape_manual(values=c('HiConf'=1,'HiConf-like' = 1, 'LowerConf' =2, "LowerConf-like" = 2)) +
facet_wrap(~ dtype , scales="free")
ggsave("../results/SF7.pdf",width=9,height=4)
Transcriptional abundance by RF confidence
transcription_cpm <- read_tsv("../data/Ansell_2017_GiardiaCPM.tsv", col_types = cols())
transc_by_category <- Gd_all_RFconf %>%
mutate(transcriptLen=(GDB_AA * 3) - 3) %>%
left_join(transcription_cpm,by="GeneID") %>%
select(GeneID,RF_confidence,transcriptLen,contains('-s')) %>%
gather(key,value,-c(GeneID,transcriptLen,RF_confidence)) %>%
mutate(value=ifelse(is.na(value),0.1,value)) %>%
mutate(value=value/transcriptLen) %>% na.omit() %>%
group_by(GeneID,RF_confidence) %>% summarize(meanVal=mean(value)) %>%
filter(!RF_confidence=="")
transc_by_category %>%
filter(meanVal > 0.001) %>%
ggplot(aes(x=RF_confidence,y=log10(meanVal))) +
geom_boxplot(aes(fill=RF_confidence), width=0.5, col="black", alpha=0.5) +
geom_violin(aes(col=RF_confidence), alpha=0) +
theme(legend.position="none") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
xlab("") + ylab("log10(length-normalized CPM)")
ggsave("../results/SF8.pdf",width=4,height=4)
#Test differences
aov_transc.res <- aov(meanVal ~ RF_confidence,
data=transc_by_category %>% filter(meanVal> 0.001))
summary(aov_transc.res)
Df Sum Sq Mean Sq F value Pr(>F)
RF_confidence 3 50.3 16.754 42.42 <2e-16 ***
Residuals 4343 1715.3 0.395
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
TukeyHSD(aov_transc.res)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = meanVal ~ RF_confidence, data = transc_by_category %>% filter(meanVal > 0.001))
$RF_confidence
diff lwr upr p adj
HiConf-like-HiConf 0.20759312 0.1019366 0.31324968 0.0000028
LowerConf-HiConf -0.16190156 -0.2199994 -0.10380374 0.0000000
LowerConf-like-HiConf -0.10677903 -0.3923471 0.17878900 0.7716333
LowerConf-HiConf-like -0.36949468 -0.4671976 -0.27179175 0.0000000
LowerConf-like-HiConf-like -0.31437215 -0.6105471 -0.01819719 0.0324399
LowerConf-like-LowerConf 0.05512253 -0.2275993 0.33784440 0.9588284
Write supplementary Tables
write_tsv(Gd_all_RFconf %>%
select(-c(seq.ssLen,
nHrc,nErc,nCrc,
strucSum,strpPDB_AA, codechain)) %>%
dplyr::rename(Exact_match_prediction =exactMatchPred,
Length_ratio=lenRatio,Match_status=matchStatus,`AA_%ID`=AApc,
C_score=Cscore,
Annot_status=descBin,Confidence_category =RF_confidence) %>%
arrange(desc(C_score,Confidence_category)),
path="../results/STable_2.tsv")
write_tsv(Gd_all_RFconf %>%
select(-c(seq.ssLen,
nHrc,nErc,nCrc,
strucSum,strpPDB_AA, codechain)) %>%
filter(RF_confidence=="HiConf-like" & descBin=="Hyp") %>%
dplyr::rename(Exact_match_prediction =exactMatchPred,
Length_ratio=lenRatio,Match_status=matchStatus,`AA_%ID`=AApc,
C_score=Cscore,
Annot_status=descBin,Confidence_category =RF_confidence) %>%
arrange(desc(C_score,Confidence_category)),
path="../results/STable_3.tsv")
RF classifier without AA_%ID
TRAIN_nopcAA <- forTRAIN %>% dplyr::select(-c(GeneID,AApc))
rf_model_nopcAA <- train(matchStatus ~. , data=TRAIN_nopcAA, method="rf",
trControl = trainControl(method="cv", number=5, verboseIter=FALSE),
prox=TRUE, verbose=FALSE)
saveRDS(rf_model_nopcAA, "../data/rf_model_nopcAA.Rds")
rf_model_nopcAA <- readRDS("../data/rf_model_nopcAA.Rds")
print(rf_model_nopcAA$finalModel)
Call:
randomForest(x = x, y = y, mtry = param$mtry, proximity = TRUE, verbose = FALSE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 11%
Confusion matrix:
exactMatch noMatch class.error
exactMatch 683 67 0.08933333
noMatch 98 652 0.13066667
Classifier performance without % AA ID.
predTEST_nopcAA <- predict(rf_model_nopcAA, newdata=TEST, type="prob")
predALL_nopcAA <- predict(rf_model_nopcAA, newdata=Gd_forRF, type="prob")
# summary(predTEST_nopcAA)
# summary(predALL_nopcAA)
#accuracy on hold-out TEST set:
TEST %>% mutate(RFexact_pred_nopcAA = predTEST_nopcAA$exactMatch) %>%
mutate(predStatus=ifelse(RFexact_pred_nopcAA > 0.5,"exactMatch","noMatch")) %>%
count(matchStatus, predStatus) %>%
group_by(matchStatus) %>% mutate(gpErr=n/sum(n))
#entire proteome
Gd_forRF %>% mutate(RFexact_pred_nopcAA = predALL_nopcAA$exactMatch) %>%
mutate(predStatus=ifelse(RFexact_pred_nopcAA > 0.5,"exactMatch","noMatch")) %>%
count(matchStatus, predStatus) %>%
group_by(matchStatus) %>% mutate(gpErr=n/sum(n))
Compute feature importance and predictive power
Gd_imp_nopcAA <- data.frame(varImp(rf_model_nopcAA, scale=FALSE)$importance)
Gd_imp_nopcAA$noi <- row.names(Gd_imp_nopcAA); row.names(Gd_imp_nopcAA) <- NULL
names(Gd_imp_nopcAA) <- c('Contribn', 'impFactor')
Gd_imp_nopcAA <- Gd_imp_nopcAA %>%dplyr::select(2,1) %>%
arrange(desc(Contribn)) %>% mutate(propCont = Contribn/sum(Contribn))
imp_plot_nopcAA <- Gd_imp_nopcAA %>%
mutate(impFactor_recode = case_when(impFactor=="Cscore" ~ 'C_score',
impFactor=="Cov" ~ 'Coverage',
impFactor=="exactMatchPred" ~ 'Exact_match_prediction',
impFactor=="lenRatio" ~ 'Length_ratio',
TRUE ~ impFactor)) %>%
ggplot(aes(x=reorder(impFactor_recode, propCont), y=propCont)) +
geom_bar(stat="identity", aes(fill=impFactor_recode), show.legend = FALSE) +
ylab("Importance") + xlab("") +
coord_flip()
imp_plot_nopcAA
ggsave("../results/SF5.pdf", imp_plot_nopcAA, width=4,height=3)