前言
植物的分子遗传研究的重要优势在于遗传群体的易得性。通过设计杂交混合不同来源亲本的基因组,自交获得一系列基因型和表型存在分离的作图群体。
双亲群体经历的连续世代较少,连锁不平衡衰减较大,即重组事件少,重组片段大。重测序会产生全基因组接近饱和的变异,但二代测序易产生错误,导致错误分型,且相邻的SNP紧密连锁,不适用于关联分析和连锁分析。BIN是染色体上连续的不发生的片段,校正了错误分型,降低了计算消耗,是双亲群体重测序的常用手段。
数据准备
founder多态性变异筛选
- 初过滤标准为PASS,二等位,无缺失,次等位频率为50%,测序深度为平均测序深度的一半至两倍
vcftools --vcf pop.snp.filt.vcf.gz
--remove-filtered-all
--max-alleles 2
--min-alleles 2
--max-missing 1
--maf 0.4
--max-maf 0.6
--indv founderA
--indv founderB
--max-meanDP 120
--min-meanDP 30
--recode
--out founder
- 多态性变异筛选,去除存在杂合的变异
grep -v "0/1" founder.recode.vcf|awk '{print$1"\t"$2}'|grep -v "#" > founder.pos
- SNPable筛选高质量位点
apply_mask_l mask_35_50.fa founder.pos > foundermask.pos
offspring高质量变异鉴定
- 初过滤标准为缺失率,maf和founder高质量位点
vcftools
--gzvcf pop.snp.filt.vcf.gz
--max-missing 0.8
--maf 0.05
--positions foundermask.pos
--recode
--out offspring_poly
- 分割染色体
vcftools
--vcf offspring_poly.recode.vcf
--chr Chr1
--recode
--out Chr1
BIN型重建
- vcf转换为SNPbinner输入文件
suppressMessages(library(vcfR))
suppressMessages(library(tidyverse))
suppressMessages(library(optparse))
option_list <- list(
make_option(c("-i", "--input"), type = "character", default = FALSE,
action = "store", help = "This is input vcf!"),
make_option(c("-o", "--out"), type = "character", default = FALSE,
action = "store", help = "This is output file!"),
make_option(c("-a", "--founderA"), type = "character", default = FALSE,
action = "store", help = "This is founder A!"),
make_option(c("-b", "--founderB"), type = "character", default = FALSE,
action = "store", help = "This is founder B!")
)
opt = parse_args(OptionParser(option_list = option_list, usage = "This Script is for trans vcf to tsv!"))
vcf <- read.vcfR(opt$input)
geno <- vcf %>%
extract.gt() %>%
as_tibble(rownames = NA) %>%
rownames_to_column(var = "markername") %>%
mutate(across(.cols = -markername,
~ str_replace_all(., pattern = "\\|", replacement = "\\/"))) %>%
rename(founderA = !!sym(opt$founderA),
founderB = !!sym(opt$founderB)) %>%
mutate(across(.cols = -markername,
~ dplyr::case_when(
. == founderA ~ "a",
. == founderB ~ "b",
. == "0/1" ~ "h",
TRUE ~ "-"
))) %>%
mutate(chrom = sapply(str_split(markername, "_"), `[`, 1) %>% str_replace(pattern = "scaffold", replacement = ""),
position = sapply(str_split(markername, "_"), `[`, 2)) %>%
select(markername, chrom, position, founderA, founderB, everything())
write_tsv(geno, file = opt$out)
Rscript vcf2tsv.r -i Chr1.recode.vcf -a founderA -b founderB -o Chr1.tsv
- snp to bin
singularity exec -e ~/Singularity_lib/python2.sif python2.7
~/software/SNPbinner/snpbinner crosspoints
-i Chr1.tsv
-o Chr1-crosspoints
-r 0.02
-l 38004428
singularity exec -e ~/Singularity_lib/python2.sif python2.7
~/software/SNPbinner/snpbinner bins
-i Chr1-crosspoints
-o Chr1-bin
-l 5000
- bin型合并
library(tidyverse)
file <- fs::dir_ls(path = "../../project/Bipgenetic/Rape/Data/bin/")
tmp <- map_dfr(.x = file,
.f = ~ read.csv(., header = FALSE) %>%
pivot_longer(cols = -V1) %>%
pivot_wider(names_from = V1,
values_from = value) %>%
select(-name),
.id = "Chrtmp") %>%
mutate(chrom = str_sub(Chrtmp, start = -10, end = -5),
markername = str_c(chrom, bin_start, sep = "_")) %>%
select(markername, chrom, everything(), -Chrtmp)
遗传作图
全基因组BIN图
单家系全基因组BIN图
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(qtl)))
suppressWarnings(suppressMessages(library(data.table)))
suppressWarnings(suppressMessages(library(ggprism)))
path <- "C:/Users/wpf/Desktop/project/Bipgenetic/"
geno <- readxl::read_excel(path = str_c(path, "Rape/Output/geno.xlsx")) %>%
as_tibble() %>%
mutate(across(.cols = -c(markername, chrom, starts_with("bin")),
~ case_when(
. == 0 ~ "AA",
. == 2 ~ "BB",
. == 1 ~ "AB"
)))
prefix <- geno %>%
select(markername, chrom, starts_with("bin"))
tmp <- geno %>%
select(-c(markername, chrom, starts_with("bin")))
tmp <- names(geno)[-c(1:5)] %>%
map_dfc( ~ geno %>%
select(all_of(.x)) %>%
separate(col = .x,
into = str_c(.x, c("_HapA", "_HapB")),
sep = 1)) %>%
bind_cols(prefix) %>%
pivot_longer(cols = -c(markername, chrom, starts_with("bin")),
names_to = c("taxa", "Hap"),
names_sep = "_",
values_to = "geno")
genome <- tmp %>%
group_by(chrom, Hap) %>%
summarise(len = max(bin_end)) %>%
ungroup() %>%
mutate(chr = sort(rep(seq(1, 19), 2)))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.
tmp %>%
left_join(genome, by = c("chrom", "Hap")) %>%
filter(taxa %in% c(0, 1)) %>% #作图示例
group_nest(taxa) %>%
mutate(plot = map(data, ~ ggplot() +
geom_bar(data = genome,
mapping = aes(x = chr, y = len/1e6, group = Hap),
colour = "white",
stat = "identity",
fill = "white",
width = 0.4,
position = position_dodge2(width = 0.5)) +
scale_x_discrete(limits = unique(genome$chrom),
position = "top") +
scale_y_continuous(breaks = seq(0, 80, 10),
trans = "reverse",
expand = expansion(mult = c(0.05, 0))) +
theme_bw() +
theme(legend.position = c(0.8, 0.2),
plot.background = element_blank() ,
panel.grid.major = element_blank(),
panel.grid.minor = element_blank() ,
panel.border = element_blank(),
axis.ticks.x = element_blank(),
axis.line.y = element_line()) +
xlab(NULL) + ylab("Loction (Mb)") +
geom_rect(data = .x,
mapping = aes(xmin = chr - 0.23,
xmax = chr + 0.23,
ymin = bin_start/1e6,
ymax = bin_end/1e6,
fill = geno,
group = Hap),
position = position_dodge(width = 0.6)) +
scale_fill_manual(values = c("#4197d8", "#f8c120"),
name = "Genotype"))) %>%
walk2(.x = .$taxa,
.y = .$plot,
.f = ~ print(.y))
群体全基因组BIN图
geno <- readxl::read_excel(path = str_c(path, "Rape/Output/geno.xlsx")) %>%
select(-markername, -bin_center, -B409, -`375`) %>%
pivot_longer(cols = -c(chrom, starts_with("bin")),
names_to = "taxa",
values_to = "geno") %>%
mutate(ind = as.numeric(taxa),
indd = ind + 1)
bin <- geno %>%
group_by(chrom) %>%
summarise(pos = max(bin_end)) %>%
mutate(poscum = cumsum(lag(pos, default = 0)),
add = 4e6,
addcum = cumsum(lag(add, default = 0)),
cum = poscum + addcum) %>%
select(chrom, cum)
tmp <- geno %>%
left_join(bin, by = "chrom") %>%
mutate(start = bin_start + cum,
end = bin_end + cum)
axis <- tmp %>%
group_by(chrom) %>%
summarise(center = mean(end))
ggplot(data = tmp) +
geom_rect(mapping = aes(xmin = ind,
xmax = indd,
ymin = start/1e6,
ymax = end/1e6,
fill = geno)) +
scale_y_continuous(breaks = axis$center/1e6, labels = axis$chrom) +
scale_x_continuous(expand = expansion(mult = c(-0.05, 0))) +
scale_fill_manual(values = c("#4197d8","grey60","#f8c120")) +
theme_bw() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank()) +
xlab(NULL) + ylab(NULL)
遗传连锁图
利用QTL IciMapping软件计算遗传距离,构建遗传连锁图谱。
- 遗传连锁图
map <- readxl::read_excel(path = str_c(path, "Rape/Output/map.xlsx")) %>%
mutate(chrom = sapply(str_split(markername, "_"), `[`, 1)) %>%
select(markername, chrom, pos) %>%
column_to_rownames(var = "markername") %>%
table2map()
plot.map(map)
- 共线性点图
tmp <- map2table(map) %>%
rownames_to_column(var = "markername") %>%
mutate(position = sapply(str_split(markername, "_"), `[`, 2))
genetic <- tmp %>%
group_by(chr) %>%
summarise(gpos = max(pos)) %>%
mutate(gposcum = cumsum(lag(gpos, default = 0))) %>%
select(chr, gposcum)
physic <- tmp %>%
group_by(chr) %>%
summarise(ppos = max(as.numeric(position))) %>%
mutate(pposcum = cumsum(lag(ppos, default = 0))) %>%
select(chr, pposcum)
tmp2 <- tmp %>%
left_join(genetic, by = "chr") %>%
left_join(physic, by = "chr") %>%
mutate(ppos = as.numeric(position) + pposcum,
gpos = pos + gposcum)
axis <- tmp2 %>%
group_by(chr) %>%
summarise(xcenter = mean(gpos),
ycenter = mean(ppos)/1e6)
ggplot(data = tmp2, aes(x = gpos, y = ppos/1e6, colour = chr)) +
geom_point() +
scale_x_continuous(breaks = axis$xcenter, labels = axis$chr) +
scale_y_continuous(breaks = axis$ycenter, labels = axis$chr) +
theme_prism() +
scale_color_manual(values = rep(c("#4197d8", "#f8c120", "#413496", "#495226", "#d60b6f", "#e66519", "#d581b7", "#83d3ad", "#7c162c", "#26755d"), 12)) +
theme(legend.position = "none",
axis.ticks = element_blank(),
axis.text.x = element_text(angle = 45, vjust = 0.6)) +
xlab("linkage Group") + ylab("Genome")
- 共线性圈图
使用Tbtools,利用R处理出输入文件
map <- readxl::read_excel(path = str_c(path, "Rape/Output/map.xlsx")) %>%
mutate(chrom = sapply(str_split(markername, "_"), `[`, 1)) %>%
select(markername, chrom, pos)
genetic <- map %>%
group_by(chrom) %>%
summarise(len = max(pos) * 1e6) %>%
mutate(chrom = str_c("LG", str_pad(row_number(), width = 2, pad = 0)),
rgb = "210,31,67") %>%
select(chrom, len ,rgb)
physic <- map %>%
mutate(pos = sapply(str_split(markername, "_"), `[`, 2) %>% as.numeric()) %>%
group_by(chrom) %>%
summarise(len = max(pos),
rgb = "51,31,209")
genetic %>%
bind_rows(physic) %>%
write_tsv(., file = str_c(path, "Rape/Output/Chrlen.tsv"), col_names = FALSE)
bin <- readxl::read_excel(path = str_c(path,"Rape/Output/geno.xlsx")) %>%
select(markername, chrom, starts_with("bin"))
gmap <- readxl::read_excel(path = str_c(path, "Rape/Output/map.xlsx")) %>%
group_by(chr) %>%
mutate(chrom = str_c("LG", str_pad(chr, width = 2, pad = 0)),
start = lag(pos, default = 0) * 1e6,
end = pos * 1e6)
color <- tibble(
chrom.y = genetic$chrom,
rgb = c("65,151,216", "248,193,32", "65,52,150", "73,82,38", "214,11,111", "230,101,25", "213,129,183", "131,211,173", "124,22,44", "38,117,93",
"65,151,216", "248,193,32", "65,52,150", "73,82,38", "214,11,111", "230,101,25", "213,129,183", "131,211,173", "124,22,44")
)
res <- bin %>%
left_join(gmap, by = "markername") %>%
select(-markername, -bin_center, -pos, -chr) %>%
mutate(across(.cols = - starts_with("chr"),
~ round(.))) %>%
left_join(color, by = "chrom.y")
write_tsv(res, file = str_c(path, "rape/Output/synteny.tsv"), col_names = FALSE)
QTL作图
利用QTL IciMapping软件的ICIM模型进行遗传作图
path <- "C:/Users/wpf/Desktop/project/WinQTLMAP/ICIM/Rape/BIP/RapeRIL/Results/RapeRIL.ric"
tmp <- fread(path) %>%
mutate(chrom = sapply(str_split(LeftMarker, "_"), `[`, 1)) %>%
select(TraitName, Chromosome, Position, LOD, chrom) %>%
group_by(TraitName) %>%
mutate(pos = row_number())
axis <- tmp %>%
group_by(chrom) %>%
summarise(center = mean(pos))
tmp %>%
ungroup() %>%
filter(str_starts(TraitName, "EaC")) %>%
group_nest(TraitName) %>%
mutate(plot = map(data, ~ ggplot() +
geom_line(data = .x,
mapping = aes(x = pos,
y = LOD,
colour = as.factor(Chromosome))) +
geom_hline(yintercept = 2.5,
color = "red",
linetype = "dashed",
alpha = 0.5) +
scale_x_continuous(labels = axis$chrom,
breaks = axis$center) +
scale_color_manual(values = rep(c("#4197d8", "#f8c120", "#413496", "#495226", "#d60b6f", "#e66519", "#d581b7", "#83d3ad", "#7c162c", "#26755d"), 12)) +
theme_prism() +
theme(legend.position = "none",
axis.ticks = element_blank(),
axis.text.x = element_text(angle = 45)) +
xlab(NULL) +
ylab("LOD"))) %>%
walk2(.x = .$TraitName,
.y = .$plot,
.f = ~ print(.y))