View on GitHub

Multi-metagenome

Recovery of complete genomes from metagenomes

Download this project as a .zip file Download this project as a tar.gz file

Wrighton data re-analysis

This small guide show the R part of the extraction of five OD1 genomes from the dataset generated by Wrighton et al., 2012 and re-analysed by Albertsen et al., 2013.

The R markdown version of the guide can be found here as wrighton.workflow.Rmd.

The raw metagenome reads were obtained from SRA and de novo assembled as described in Albertsen et al., 2013.

Download the R formatted data.

Download and upack the Wrighton data used in Albertsen et al., 2013.

download.file('https://dl.dropbox.com/s/irahp88uleuqzv6/wrighton.tar.gz','wrighton.tar.gz', method = 'wget')
untar('wrighton.tar.gz')

Load needed packages

In case you havn’t installed all the needed packages, they can be installed via e.g. install.packages('vegan'). The version of R used to generate this file was:

R.version$version.string
"R version 3.0.1 (2013-05-16)"
library("vegan")
library("plyr")
library("RColorBrewer")
library("alphahull")
library("ggplot2")

Load files associated with the Albertsen de novo assembly of the Wrighton data

All data except the three coverage estimates (artur, dolly and chris) was generated from a fasta file of the assembled scaffolds (assembly.fa) using the script: workflow.R.data.generation.sh. Coverage estimates for the scaffolds was obtained through CLC’s short read mapper.

artur <- read.csv("assembly/artur.csv", header = T)
dolly <- read.csv("assembly/dolly.csv", header = T)
chris <- read.csv("assembly/chris.csv", header = T)

gc <- read.delim("assembly/assembly.gc.tab", header = T)
kmer <- read.delim("assembly/assembly.kmer.tab", header = T)
ess <- read.table("assembly/assembly.orfs.hmm.id.txt", header = F)
ess.tax <- read.delim("assembly/assembly.orfs.hmm.blast.tax.tab", header = F)
cons.tax <- read.delim("assembly/assembly.tax.consensus.txt", header = T)

colnames(kmer)[1] = "name"
colnames(ess) = c("name", "orf", "hmm.id")
colnames(ess.tax) = c("name", "orf", "phylum")
colnames(cons.tax) = c("name", "phylum", "tax.color", "all.assignments")

Merge all data on scaffolds into a single data frame d.

d <- as.data.frame(cbind(artur$Name, artur$Reference.length, gc$gc, artur$Average.coverage, dolly$Average.coverage, chris$Average.coverage), row.names = F)
colnames(d) = c("name", "length", "gc", "artur", "dolly", "chris")
d <- merge(d, cons.tax, by = "name", all = T)

Merge all data on essential genes into a single data frame e.

e <- merge(ess, d, by = "name", all.x = T)
e <- merge(e, ess.tax, by = c("name", "orf"), all.x = T)

Load the original de novo assembly by Wrighton and the associated bins

I added the data as a simple single text file in the org.wrigton folder. The assembled sequences and original bins were obtained from here.

w.d <- read.delim("org.wrighton/acd.genomes.txt", header = T)

Define a few functions for later use

Calculate basic statistics on a set of scaffolds.

genome.stats <- matrix(NA, nrow = 0, ncol = 10)
colnames(genome.stats) <- c("total.length", "# scaffolds", "mean.length", "max.length", "gc", "artur", "dolly", "chris", "tot.ess", "uni.ess")

calc.genome.stats <- function(x, y) matrix(c(sum(x$length), nrow(x), round(mean(x$length), 1), max(x$length), round(sum((x$gc * x$length))/sum(x$length), 1), round(sum((x$artur * x$length))/sum(x$length), 1), round(sum((x$dolly * x$length))/sum(x$length), 1), round(sum((x$chris * x$length))/sum(x$length), 1), nrow(y), length(unique(y$hmm.id))), dimnames = list(colnames(genome.stats), ""))

Extract a subset of scaffolds.

extract <- function(x, a.def, v1, v2) {
    out <- {}
    for (i in 1:nrow(x)) {
        if (inahull(a.def, c(v1[i], v2[i]))) 
            out <- rbind(out, x[i, ])
    }
    return(out)
}

GC color scheme.

rgb.c <- colorRampPalette(c("red", "green", "blue"))
rgb.a <- adjustcolor(rgb.c(max(d$gc) - min(d$gc)), alpha.f = 0.5)

Initial overview of the data

calc.genome.stats(d, e)
total.length 171324044.0
# scaffolds      62199.0
mean.length       2754.5
max.length      220980.0
gc                  43.5
artur               14.7
dolly               15.0
chris               17.5
tot.ess           5800.0
uni.ess            108.0

To get an initial overview of the data we only use scaffolds > 5000 bp.

ds <- subset(d, length > 5000)
es <- subset(e, length > 5000)

Coverage plots - Colored by GC

ggplot(ds, aes(x = artur, y = chris, color = gc, size = length)) + 
  scale_x_log10(limits = c(0.5, 200)) + 
  scale_y_log10(limits = c(1, 300)) + 
  xlab("Coverage (artur)") + 
  ylab("Coverage (chris)") + 
  geom_point(alpha = 0.5) + 
  scale_size_area(name = "Scaffold length", max_size = 10) + 
  scale_colour_gradientn(colours = c("red", "green", "blue"))

plot of chunk Overview_coverage_gc_plot

Original Wrighton coverage plots - Colored by GC

w.ds <- subset(w.d, length > 5000)
ggplot(w.ds, aes(x = artur, y = chris, color = gc, size = length)) + 
  scale_x_log10(limits = c(0.5, 200)) + 
  scale_y_log10(limits = c(1, 300)) + 
  xlab("Coverage (artur)") + 
  ylab("Coverage (chris)") + 
  geom_point(alpha = 0.5) + 
  scale_size_area(name = "Scaffold length", max_size = 10) + 
  scale_colour_gradientn(colours = c("red", "green", "blue"))

plot of chunk Overview_wrighton_coverage_gc_plot

Original Wrighton coverage plots - Colored by bins

ggplot(w.ds, aes(x = artur, y = chris, color = bin, size = length)) + 
  scale_x_log10(limits = c(0.5, 200)) + 
  scale_y_log10(limits = c(1, 300)) + 
  xlab("Coverage (artur)") + 
  ylab("Coverage (chris)") + 
  geom_point(alpha = 0.5) + 
  scale_size_area(name = "Scaffold length", max_size = 10)

plot of chunk Overview_wrighton_coverage_bin_plot

Looking at the 5 orginal OD1 bins from Wrighton

w.d.od1 <- subset(w.d, bin == "ACD7" | bin == "ACD11" | bin == "ACD18" | bin == "ACD81" | bin == "ACD1")
ggplot(w.d.od1, aes(x = artur, y = dolly, color = bin, size = length)) + 
  scale_x_log10(limits = c(0.5, 200)) + 
  scale_y_log10(limits = c(1, 300)) + 
  xlab("Coverage (artur)") + 
  ylab("Coverage (chris)") + 
  geom_point(alpha = 0.5) + 
  scale_size_area(name = "Scaffold length", max_size = 10)

plot of chunk 5OD1_wrighton_coverage_bin_plot

Genome extraction

ACD7 (ID1)

palette(rgb.a)

x <- "artur"
y <- "dolly"

plot(d[, x], d[, y], log = "xy", cex = sqrt(d$length)/100, pch = 20, col = d$gc - min(d$gc), xlim = c(10, 110), ylim = c(110, 330), xlab = "artur", ylab = "dolly")

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(22.53, 29.37, 43.54, 41.99, 32.09, 22.00, 21.65)
def$y <- c(158.24, 198.66, 196.48, 147.04, 112.07, 111.66, 125.12)

selection.A <- ahull(def, alpha = 1e+05)

plot(selection.A, col = "black", add = T)

plot of chunk ACD7_Zoom

Extract all scaffolds and information on essential genes within the defined subspace using the extract function.

s.A <- extract(d, selection.A, d[, x], d[, y])
e.A <- extract(e, selection.A, e[, x], e[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.A, e.A)
total.length 1411386.0
# scaffolds       59.0
mean.length    23921.8
max.length    110546.0
gc                34.9
artur             31.6
dolly            154.2
chris            151.4
tot.ess           96.0
uni.ess           96.0

Add the genome statistics to a list and print the name of the scaffolds to a file for further refinement.

genome.stats <- rbind(genome.stats, t(calc.genome.stats(s.A, e.A)))
rownames(genome.stats)[nrow(genome.stats)] <- "ACD7"
show(genome.stats)
     total.length # scaffolds mean.length max.length   gc artur dolly chris tot.ess uni.ess
ACD7      1411386          59       23922     110546 34.9  31.6 154.2 151.4      96      96
write.table(g1.s.A$name, file = "ACD7.txt", quote = F, row.names = F, col.names = F)

The name of the scaffolds can be used to extract the scaffolds from the original assembly.fa.

ACD11 (ID7)

palette(rgb.a)

x <- "artur"
y <- "chris"

plot(d[, x], d[, y], log = "xy", cex = sqrt(d$length)/100, pch = 20, col = d$gc - min(d$gc), xlim = c(14, 20), ylim = c(12, 24), xlab = "Coverage (artur)", ylab = "Coverage (chris)")

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(14.49, 15.44, 16.40, 16.43, 15.84, 14.88, 14.18, 14.29)
def$y <- c(18.89, 19.86, 18.51, 16.86, 16.37, 15.91, 16.21, 17.35)

selection.A <- ahull(def, alpha = 1e+05)

plot(selection.A, col = "black", add = T)

plot of chunk ACD11_Zoom

Extract all scaffolds and information on essential genes within the defined subspace using the extract function.

s.A <- extract(d, selection.A, d[, x], d[, y])
e.A <- extract(e, selection.A, e[, x], e[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.A, e.A)
total.length 1074415.0
# scaffolds       76.0
mean.length    14137.0
max.length    220980.0
gc                38.5
artur             15.4
dolly             29.0
chris             17.5
tot.ess           97.0
uni.ess           91.0

Which of the single copy genes are duplicated? Note that some genomes might have duplicates of some “single copy genes”.

d.A <- e.A[which(duplicated(e.A$hmm.id) | duplicated(e.A$hmm.id, fromLast = TRUE)), ]
d.A[order(d.A$hmm.id), c(1, 3, 9)]
      name     hmm.id       phylum.x
713   1361 PF00750.14           <NA>
4584   536 PF00750.14     Firmicutes
5264   735 PF00750.14     Firmicutes
1897 22123  TIGR01009 Proteobacteria
2928  3244  TIGR01009     Firmicutes
1898 22123  TIGR01044 Proteobacteria
2927  3244  TIGR01044     Firmicutes
1899 22123  TIGR01050 Proteobacteria
2926  3244  TIGR01050     Firmicutes
5261   735  TIGR02012     Firmicutes
5284  7436  TIGR02012           <NA>

As there is multiple genomes in the subset we make a PCA on the scaffolds in the subset.

rda <- rda(kmer[s.A$name, 2:ncol(kmer)], scale = T)
scores <- scores(rda, choices = 1:5)$sites

s.B <- cbind(s.A, scores)
e.B <- merge(e.A, s.B[, c(1, 9:13)], all.x = T, by = "name")
d.B <- merge(d.A, s.B[, c(1, 9:13)], all.x = T, by = "name")

We use the pairs function to plot the first 3 components along with GC and coverages.

palette(rgb.a)
pairs(s.B[, c(4, 5, 6, 3, 10:12)], upper.panel = NULL, col = s.B$gc - min(d$gc), cex = sqrt(s.B$length)/100, pch = 20)

plot of chunk ACD11_plot_PCA

GC and PC1 seem to seperate our target genome from the other scaffolds and is therefore used for another extraction using the locator function.

palette(rgb.a)

x <- "PC1"
y <- "gc"

plot(s.B[, x], s.B[, y], cex = sqrt(s.B$length)/100, pch = 20, col = s.B$gc - min(d$gc), xlab = x, ylab = y)

palette(brewer.pal(9, "Set1"))

points(s.B[, x], s.B[, y], col = s.B$tax.color + 1, pch = 20)

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(-2.24, -1.92, -1.32, -1.11, -1.35, -1.91, -2.21)
def$y <- c(36.54, 41.04, 41.34, 38.59, 33.38, 30.21, 30.93)

selection.B <- ahull(def, alpha = 1e+05)

plot(selection.B, col = "black", add = T)

plot of chunk ACD11_extract_PCA

Again the extract function is used to retrive the scaffolds in the selected subset.

s.C <- extract(s.B, selection.B, s.B[, x], s.B[, y])
e.C <- extract(e.B, selection.B, e.B[, x], e.B[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.C, e.C)
total.length 955434.0
# scaffolds      22.0
mean.length   43428.8
max.length   220980.0
gc               36.3
artur            15.4
dolly            31.7
chris            17.4
tot.ess          91.0
uni.ess          89.0

There are a few duplicated “single copy genes” however in this case it is not due to mulitple species in the bin, but real duplicates in the genome.

d.C <- e.C[which(duplicated(e.C$hmm.id) | duplicated(e.C$hmm.id, fromLast = TRUE)), ]
d.C[order(d.C$hmm.id), c(1, 3, 9)]
   name     hmm.id   phylum.x
4   536 PF00750.14 Firmicutes
21  735 PF00750.14 Firmicutes
28 1361 PF00750.14       <NA>

Add the genome statistics to a list and print the name of the scaffolds to a file for further refinement.

genome.stats <- rbind(genome.stats, t(calc.genome.stats(s.C, e.C)))
rownames(genome.stats)[nrow(genome.stats)] <- "ACD11"
show(genome.stats)
      total.length # scaffolds mean.length max.length   gc artur dolly chris tot.ess uni.ess
ACD7       1411386          59       23922     110546 34.9  31.6 154.2  151.4      96      96
ACD11       955434          22       43429     220980 36.3  15.4  31.7   17.4      91      89
write.table(s.C$name, file = "ACD11.txt", quote = F, row.names = F, col.names = F)

ACD18 (ID3)

palette(rgb.a)

x <- "artur"
y <- "chris"

plot(d[, x], d[, y], log = "xy", cex = sqrt(d$length)/100, pch = 20, col = d$gc - min(d$gc), xlim = c(5, 15), ylim = c(20, 35), xlab = "Coverage (artur)", ylab = "Coverage (chris)")

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(7.12, 7.75, 8.80, 10.54, 10.91, 10.97, 9.70, 8.65, 7.66, 6.81, 6.96)
def$y <- c(26.89, 28.41, 30.11, 29.28, 27.47, 25.55, 24.59, 23.85, 22.95, 23.46, 25.60)

selection.A <- ahull(def, alpha = 1e+05)

plot(selection.A, col = "black", add = T)

plot of chunk ACD18_Zoom

Extract all scaffolds and information on essential genes within the defined subspace using the extract function.

s.A <- extract(d, selection.A, d[, x], d[, y])
e.A <- extract(e, selection.A, e[, x], e[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.A, e.A)
total.length 1230983.0
# scaffolds      110.0
mean.length    11190.8
max.length     65437.0
gc                34.0
artur              8.9
dolly              8.6
chris             26.7
tot.ess           92.0
uni.ess           86.0

Which of the single copy genes are duplicated? Note that some genomes might have duplicates of some “single copy genes”.

d.A <- e.A[which(duplicated(e.A$hmm.id) | duplicated(e.A$hmm.id, fromLast = TRUE)), ]
d.A[order(d.A$hmm.id), c(1, 3, 9)]
      name     hmm.id       phylum.x
3795 43010 PF00297.17 Proteobacteria
4944  5995 PF00297.17     Firmicutes
4285 49498 PF00411.14           <NA>
4615   540 PF00411.14     Firmicutes
4286 49498 PF00416.17           <NA>
4617   540 PF00416.17     Firmicutes
3796 43010 PF00573.17 Proteobacteria
4942  5995 PF00573.17     Firmicutes
607  13098  TIGR00436           <NA>
5431  8080  TIGR00436     Firmicutes
4613   540  TIGR02027     Firmicutes
5630  9043  TIGR02027 Proteobacteria

As there is multiple genomes in the subset we make a PCA on the scaffolds in the subset.

rda <- rda(kmer[s.A$name, 2:ncol(kmer)], scale = T)
scores <- scores(rda, choices = 1:5)$sites

s.B <- cbind(s.A, scores)
e.B <- merge(e.A, s.B[, c(1, 9:13)], all.x = T, by = "name")
d.B <- merge(d.A, s.B[, c(1, 9:13)], all.x = T, by = "name")

We use the pairs function to plot the first 5 components. I’ve also added GC and coverage.

palette(rgb.a)
pairs(s.B[, c(4, 5, 6, 3, 10:12)], upper.panel = NULL, col = s.B$gc - min(d$gc), cex = sqrt(s.B$length)/100, pch = 20)

plot of chunk ACD18_plot_PCA

PC1 and PC2 seem to seperate our target genome from the other scaffolds and is therefore used for another extraction using the locator function.

palette(rgb.a)

x <- "PC1"
y <- "PC2"

plot(s.B[, x], s.B[, y], cex = sqrt(s.B$length)/100, pch = 20, col = s.B$gc - min(d$gc), xlab = x, ylab = y)

palette(brewer.pal(9, "Set1"))

points(s.B[, x], s.B[, y], col = s.B$tax.color + 1, pch = 20)

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(-1.39, -0.89, -0.27, -0.44, -1.12, -1.40)
def$y <- c(1.75, 1.93, 0.80, -2.00, -2.27, -0.54)

selection.B <- ahull(def, alpha = 1e+05)

plot(selection.B, col = "black", add = T)

plot of chunk ACD18_Extract_PCA

Again the extract function is used to retrive the scaffolds in the selected subset.

s.C <- extract(s.B, selection.B, s.B[, x], s.B[, y])
e.C <- extract(e.B, selection.B, e.B[, x], e.B[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.C, e.C)
total.length 1110432.0
# scaffolds       72.0
mean.length    15422.7
max.length     65437.0
gc                32.3
artur              8.8
dolly              8.5
chris             26.7
tot.ess           86.0
uni.ess           85.0

There are a few duplicated “single copy genes” however in this case it is not due to mulitple species in the bin, but real duplicates in the genome.

d.C <- e.C[which(duplicated(e.C$hmm.id) | duplicated(e.C$hmm.id, fromLast = TRUE)), ]
d.C[order(d.C$hmm.id), c(1, 3, 9)]
    name    hmm.id   phylum.x
73  8080 TIGR00436 Firmicutes
80 13098 TIGR00436       <NA>

Add the genome statistics to a list and print the name of the scaffolds to a file for further refinement.

genome.stats <- rbind(genome.stats, t(calc.genome.stats(s.C, e.C)))
rownames(genome.stats)[nrow(genome.stats)] <- "ACD18"
show(genome.stats)
      total.length # scaffolds mean.length max.length   gc artur dolly chris tot.ess uni.ess
ACD7       1411386          59       23922     110546 34.9  31.6 154.2  151.4      96      96
ACD11       955434          22       43429     220980 36.3  15.4  31.7   17.4      91      89
ACD18      1110432          72       15423      65437 32.3   8.8   8.5   26.7      86      85
write.table(s.C$name, file = "ACD18.txt", quote = F, row.names = F, col.names = F)

ACD81 (ID16)

palette(rgb.a)

x <- "artur"
y <- "chris"

plot(d[, x], d[, y], log = "xy", cex = sqrt(d$length)/100, pch = 20, col = d$gc - min(d$gc), xlim = c(50, 400), ylim = c(20, 100), xlab = "Coverage (artur)", ylab = "Coverage (chris)")

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(141.32, 182.22, 220.68, 206.56, 154.17, 115.47, 91.13, 106.59)
def$y <- c(77.13, 89.70, 82.59, 66.62, 52.24, 42.95, 47.99, 59.07)

selection.A <- ahull(def, alpha = 1e+05)

plot(selection.A, col = "black", add = T)

plot of chunk ACD81_Zoom

Extract all scaffolds and information on essential genes within the defined subspace using the extract function.

s.A <- extract(d, selection.A, d[, x], d[, y])
e.A <- extract(e, selection.A, e[, x], e[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.A, e.A)
##                      
## total.length 835481.0
## # scaffolds     161.0
## mean.length    5189.3
## max.length    85982.0
## gc               43.3
## artur           147.5
## dolly            12.4
## chris            63.8
## tot.ess          76.0
## uni.ess          74.0

Which of the single copy genes are duplicated? Note that some genomes might have duplicates of some “single copy genes”.

d.A <- e.A[which(duplicated(e.A$hmm.id) | duplicated(e.A$hmm.id, fromLast = TRUE)), ]
d.A[order(d.A$hmm.id), c(1, 3, 9)]
     name    hmm.id   phylum.x
3170 3520 TIGR00442 Firmicutes
5429  802 TIGR00442       <NA>
5231 7191 TIGR00981 Firmicutes
5482  828 TIGR00981       <NA>

As there is multiple genomes in the subset we make a PCA on the scaffolds in the subset.

rda <- rda(kmer[s.A$name, 2:ncol(kmer)], scale = T)
scores <- scores(rda, choices = 1:5)$sites

s.B <- cbind(s.A, scores)
e.B <- merge(e.A, s.B[, c(1, 9:13)], all.x = T, by = "name")
d.B <- merge(d.A, s.B[, c(1, 9:13)], all.x = T, by = "name")

We use the pairs function to plot the first 3 components. I’ve also added GC and coverage.

palette(rgb.a)
pairs(s.B[, c(4, 5, 6, 3, 10:12)], upper.panel = NULL, col = s.B$gc - min(d$gc), cex = sqrt(s.B$length)/100, pch = 20)

plot of chunk ACD81_plot_PCA

Dolly and PC1 seem to seperate our target genome from the other scaffolds and is therefore used for another extraction using the locator function.

palette(rgb.a)

x <- "dolly"
y <- "PC1"

plot(s.B[, x], s.B[, y], cex = sqrt(s.B$length)/100, pch = 20, col = s.B$gc - min(d$gc), xlab = x, ylab = y)

palette(brewer.pal(9, "Set1"))

points(s.B[, x], s.B[, y], col = s.B$tax.color + 1, pch = 20)

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(4.70, 10.45, 21.96, 25.94, 19.75, 6.91, 4.26, 3.48)
def$y <- c(3.08, 4.14, 3.67, 0.09, -2.74, -3.08, -0.75, 1.24)

selection.B <- ahull(def, alpha = 1e+05)

plot(selection.B, col = "black", add = T)

plot of chunk ACD81_extract_PCA

Again the extract function is used to retrive the scaffolds in the selected subset.

s.C <- extract(s.B, selection.B, s.B[, x], s.B[, y])
e.C <- extract(e.B, selection.B, e.B[, x], e.B[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.C, e.C)
total.length 826830.0
# scaffolds     155.0
mean.length    5334.4
max.length    85982.0
gc               43.4
artur           147.6
dolly            11.9
chris            63.7
tot.ess          76.0
uni.ess          74.0

There are a few duplicated “single copy genes”. In this case it might indicate that the bin includes a small amount of another bacteria. It can be cleaned up by using the cytoscape network graph approach.

d.C <- e.C[which(duplicated(e.C$hmm.id) | duplicated(e.C$hmm.id, fromLast = TRUE)), ]
d.C[order(d.C$hmm.id), c(1, 3, 9)]
   name    hmm.id   phylum.x
53  802 TIGR00442       <NA>
67 3520 TIGR00442 Firmicutes
56  828 TIGR00981       <NA>
75 7191 TIGR00981 Firmicutes

Add the genome statistics to a list and print the name of the scaffolds to a file for further refinement.

genome.stats <- rbind(genome.stats, t(calc.genome.stats(s.C, e.C)))
rownames(genome.stats)[nrow(genome.stats)] <- "ACD81"
show(genome.stats)
      total.length # scaffolds mean.length max.length   gc artur dolly  chris tot.ess uni.ess
ACD7       1411386          59       23922     110546 34.9  31.6 154.2  151.4      96      96
ACD11       955434          22       43429     220980 36.3  15.4  31.7   17.4      91      89
ACD18      1110432          72       15423      65437 32.3   8.8   8.5   26.7      86      85
ACD81       826830         155        5334      85982 43.4 147.6  11.9   63.7      76      74
write.table(s.C$name, file = "ACD81.txt", quote = F, row.names = F, col.names = F)

ACD1 (ID2)

palette(rgb.a)

x <- "artur"
y <- "chris"

plot(d[, x], d[, y], log = "xy", cex = sqrt(d$length)/100, pch = 20, col = d$gc - min(d$gc), xlim = c(10, 40), ylim = c(50, 200), xlab = "Coverage (artur)", ylab = "Coverage (chris)")

# def<-locator(100, type='p', pch=20)

def <- {}
def$x <- c(13.56, 18.23, 22.18, 20.97, 14.47, 12.58)
def$y <- c(137.01, 183.62, 163.20, 139.54, 111.12, 113.64)

selection.A <- ahull(def, alpha = 1e+05)

plot(selection.A, col = "black", add = T)

plot of chunk ACD1_Zoom

Extract all scaffolds and information on essential genes within the defined subspace using the extract function.

s.A <- extract(d, selection.A, d[, x], d[, y])
e.A <- extract(e, selection.A, e[, x], e[, y])

See the basic statistics of the selected scaffolds.

calc.genome.stats(s.A, e.A)
total.length 1269939.0
# scaffolds       23.0
mean.length    55214.7
max.length    176667.0
gc                38.3
artur             17.5
dolly            249.1
chris            149.2
tot.ess           97.0
uni.ess           94.0

Which of the single copy genes are duplicated? Note that some genomes might have duplicates of some “single copy genes”.

d.A <- e.A[which(duplicated(e.A$hmm.id) | duplicated(e.A$hmm.id, fromLast = TRUE)), ]
d.A[order(d.A$hmm.id), c(1, 3, 9)]
     name     hmm.id   phylum.x
2316 2608 PF00750.14 Firmicutes
4299 4971 PF00750.14 Firmicutes
943  1510  TIGR00029 Firmicutes
2390  270  TIGR00029       <NA>
357   119  TIGR00442 Firmicutes
1685 2054  TIGR00442 Firmicutes

As there is multiple genomes in the subset we make a PCA on the scaffolds in the subset.

rda <- rda(kmer[s.A$name, 2:ncol(kmer)], scale = T)
scores <- scores(rda, choices = 1:5)$sites

s.B <- cbind(s.A, scores)
e.B <- merge(e.A, s.B[, c(1, 9:13)], all.x = T, by = "name")
d.B <- merge(d.A, s.B[, c(1, 9:13)], all.x = T, by = "name")

We use the pairs function to plot the first 3 components. I’ve also added GC and coverage. There is nothing obiouvs that can be removed.

palette(rgb.a)
pairs(s.B[, c(4, 5, 6, 3, 10:12)], upper.panel = NULL, col = s.B$gc - min(d$gc), cex = sqrt(s.B$length)/100, pch = 20)

plot of chunk ACD1_plot_PCA

Add the genome statistics to a list and print the name of the scaffolds to a file for further refinement.

genome.stats <- rbind(genome.stats, t(calc.genome.stats(s.B, e.B)))
rownames(genome.stats)[nrow(genome.stats)] <- "ACD1"
show(genome.stats)
      total.length # scaffolds mean.length max.length   gc artur dolly  chris tot.ess uni.ess
ACD7       1411386          59       23922     110546 34.9  31.6 154.2  151.4      96      96
ACD11       955434          22       43429     220980 36.3  15.4  31.7   17.4      91      89
ACD18      1110432          72       15423      65437 32.3   8.8   8.5   26.7      86      85
ACD81       826830         155        5334      85982 43.4 147.6  11.9   63.7      76      74
ACD1       1269939          23       55215     176667 38.3  17.5 249.1  149.2      97      94
write.table(s.B$name, file = "ACD1.txt", quote = F, row.names = F, col.names = F)