NGS可视化

2024-03-22 约 1241 字预计阅读 3 分钟

数据准备

创建合适的文件结构，使用软连接连接对应bam文件

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


#all_bam.txt
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/00.DNase/bam/shoot_DNase-seq.bam
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/01.ATAC_seq/bam/shoot_ATAC-seq.bam
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/02.Chip_seq/bam/H3K9ac_Chip-seq.bam
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/02.Chip_seq/bam/shoot_1_H3K27ac.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/02.Chip_seq/bam/shoot_1_H3K4me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/00.shoot_1/result/03.methylation/bam/output-prefix.bsmap.mkdup.bam
/home/data/yuanh/work_data/230612_encher_data/01.ear_1/result/01.ATAC_seq/bam/Zmays_ear_small.bam
/home/data/yuanh/work_data/230612_encher_data/01.ear_1/result/02.Chip_seq/bam/ear_1_H3K27ac.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/01.ear_1/result/02.Chip_seq/bam/ear_1_H3K4me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/02_ear_2/result/01.ATAC_seq/bam/ear_ATAC-seq.bam
/home/data/yuanh/work_data/230612_encher_data/02_ear_2/result/02.Chip_seq/bam/ear_2_H3K27me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/02_ear_2/result/02.Chip_seq/bam/ear_2_H3K4me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/02_ear_2/result/02.Chip_seq/bam/ear_2_H3K9ac.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/02_ear_2/result/03.methylation/bam/output-prefix.bsmap.mkdup.bam
/home/data/yuanh/work_data/230612_encher_data/03.tassel/result/01.ATAC_seq/bam/tassel_ATAC-seq.bam
/home/data/yuanh/work_data/230612_encher_data/03.tassel/result/02.Chip_seq/bam/tassel_H3K27me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/03.tassel/result/02.Chip_seq/bam/tassel_H3K4me3.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/03.tassel/result/02.Chip_seq/bam/tassel_H3K9ac.rmdup.bam
/home/data/yuanh/work_data/230612_encher_data/03.tassel/result/03.methylation/bam/output-prefix.bsmap.mkdup.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/00.DNase/bam/DNase.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/01.ATAC_seq/bam/Zmays_young_leaf_2c.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/H3K9ac_Chip-seq.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/SRR9016266_H3K27ac_ChIP-seq_rep1_picard_.sort.bam.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/SRR9016267_H3K4me1_ChIP-seq_Rep1_picard_.sort.bam.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/SRR9016268_RNA11_ChIP-seq_Rep1_picard_.sort.bam.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/SRR9016269_H3k27me3_ChIP-seq_Rep1_picard_.sort.bam.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/02.Chip_seq/bam/SRR9016270_H3K4me3_ChIP-seq_Rep1_picard_.sort.bam.bam
/home/data/yuanh/work_data/230612_encher_data/04.shoot_2/result/03.methylation/bam/shoot_2_methylation.sorted.bam

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


#! /bin/bash

while IFS= read -r line; do
  # tissue、omic、file_path
  file_path="$line"
  omic=$(echo "$file_path" | awk -F'/' '{gsub(/[0-9]+\.|_/, "", $(NF-2)); print $(NF-2)}')
  tissue=$(echo "$file_path" | awk -F'/' '{gsub(/[0-9]+\.|_/, "", $(NF-4)); print $(NF-4)}')

  # mkdir
  target_dir="./$tissue/$omic/bam"
  echo "$target_dir"
  mkdir -p "$target_dir"

  # ln -s
  if [ "$omic" == "Chip_seq" ]; then
    # Chip_seq
    link_name=$(basename "$file_path" | grep -oP 'H[123]K[1-9]*(me[123]|ac)|RNA11')
  else
    #omic
    link_name="$omic.bam"
  fi
  ln -s "$file_path" "$target_dir/$link_name"
done < all_bam.txt

递归查询有多少软链接建立，和txt中文件路径相符

1
2
3


(base) yuanhx@T640server 23:08:13 ~/dzx/ultimate/1.input_circumstance/data
$ find ./ -type l |wc -l
28

deeptools去处理链特异性的数据

我们要分析测序数据在基因上的分布特征，显然不能将来自于负链基因数据统计到同一区域的正链基因上。 一个简单的思路就是：

首先将BAM文件根据链拆分成正链和负链的BAM文件；
再用正链的BAM文件去和正链基因的BED文件去 computeMatrix，负链同理；
最后再将正负链 computeMatrix的结果进行合并。

bam文件区分正负链

1
2
3
4
5


# 选择比对到正链的reads
samtools view -F 16 -b -o positive.bam example.bam

# 选择比对到负链的reads
samtools view -f 16 -b -o negative.bam example.bam 

“-f"参数用于选择具有特定标志的读取，"-F"参数用于排除具有特定标志的读取

SAM/BAM文件中，每个读取都有一个FLAG字段，这个字段是一个位标志，用于表示读取的各种属性，这些属性包括读取是否配对，是否比对到参考序列，是否是配对中的第一读取，等等

对于链特异性，我们关注的是第5位，如果这一位是1，那么读取比对到参考序列的负链，如果是0，那么读取比对到参考序列的正链。这一位的值是16，所以使用16作为参数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12


#divide the bam files into positive and negative
bam_files=$(find ./ -type l)
for bam in $bam_files; do
    #remove the .bam which is the last 4 characters
    file_name=${bam::-4}
    samtools view -@ 9 -f 16 -b -o ${file_name}_negative.bam $bam
    samtools view -@ 9 -F 16 -b -o ${file_name}_positive.bam $bam
    samtools index ${file_name}_negative.bam ${file_name}_negative.bam.bai
    samtools index ${file_name}_positive.bam ${file_name}_positive.bam.bai
    bamCoverage -b ${file_name}_negative.bam -o ${file_name}_negative.bw --binSize 10 --normalizeUsing RPKM --scaleFactor 1 --numberOfProcessors 10
    bamCoverage -b ${file_name}_positive.bam -o ${file_name}_positive.bw --binSize 10 --normalizeUsing RPKM --scaleFactor 1 --numberOfProcessors 10
done

bed基因文件区分正负链

1
2
3
4


#divide gene.bed into positive and negative
bed="./gene.bed"
awk '{if($5=="+") print $0}' $bed > gene_positive.bed
awk '{if($5=="-") print $0}' $bed > gene_negative.bed

分正负链基因计算reads富集分数

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


#! /bin/bash
# Path: static/NGS_visualization/script

#sbatch --dependency=afterok:31482 ./run.slurm

#using computeMatrix to calculate the matrix
bam_files=$(find ./ -type l| grep "bam$")
for bam in $bam_files; do
    file_path=${dirname(dirname($bam))}
    mkdir -p $file_path/matrix
    omic=${basename(bam::-4)}
    file_name=${bam::-4}
    computeMatrix scale-regions -p 12 \
      -S ${file_name}_positive.bw \
      -R gene_positive.bed \
      -b 1000 -a 1000 \
      -o $file_path/matrix/${omic}_positive.gz \
      --binSize 10 --missingDataAsZero \
      --skipZeros \
      --outFileNameMatrix $file_path/matrix/${omic}_positive.tab
    computeMatrix scale-regions -p 12 \
      -S ${file_name}_negative.bw \
      -R gene_negative.bed \
      -b 1000 -a 1000 \
      -o $file_path/matrix/${omic}_negative.gz
      --binSize 10 --missingDataAsZero \
      --skipZeros \
      --outFileNameMatrix $file_path/matrix/${omic}_negative.tab
done

合并正负链tab文件

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


 #合并正负链tab文件
bam_files=$(find ./ -type l| grep "bam$")
for bam in $bam_files; do
    file_path=$(dirname "$(dirname "$bam")")
    omic=$(basename "${bam::-4}")
    #根据两个文件表头确定新表头
    #first line
    group_boundaries_positive=$(zcat $file_path/matrix/${omic}_positive.gz | head -n 1 | grep -oP '"group_boundaries":\[\K[^\]]*')
    group_boundaries_negative=$(zcat $file_path/matrix/${omic}_negative.gz | head -n 1 | grep -oP '"group_boundaries":\[\K[^\]]*')
    # 计算新的 group_boundaries 的值
    num1=$(echo $group_boundaries_positive | awk -F ',' '{print $2}')
    num2=$(echo $group_boundaries_negative | awk -F ',' '{print $2}')
    new_group_boundaries="0,$((num1+num2))"
    # 从 positive 文件中提取 JSON 数据，并将 group_boundaries 和 sample_labels 的值进行修改
    new_json=$(zcat $file_path/matrix/${omic}_positive.gz | head -n 1 | sed "s/\"group_boundaries\":\[[^]]*\]/\"group_boundaries\":[$new_group_boundaries]/g" | sed "s/\"sample_labels\":\[[^]]*\]/\"sample_labels\":[\"$omic\"]/g")
    # 输出新的 JSON 数据
    echo $new_json > $file_path/matrix/${omic}.tab

    #每个gz文件去除第一行行合并
    zcat $file_path/matrix/${omic}_positive.gz | tail -n +2 >> $file_path/matrix/${omic}.tab
    zcat $file_path/matrix/${omic}_negative.gz | tail -n +2 >> $file_path/matrix/${omic}.tab
    #gzip the tab file
    gzip -c $file_path/matrix/${omic}.tab > $file_path/matrix/${omic}.tab.gz
done

画图

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20


#plotHeatmap
#! /bin/bash

bam_files=$(find ./ -type l| grep "bam$")
for bam in $bam_files; do
    file_path=$(dirname "$(dirname "$bam")")
    omic=$(basename "${bam::-4}")
    mkdir -p $file_path/plotheatmap
    plotHeatmap -m $file_path/matrix/${omic}.tab.gz \
      -out $file_path/plotheatmap/${omic}.pdf \
      --dpi 300 \
      --colorMap YlGnBu  \
      --missingDataColor "#FFF6EB"\
      --heatmapHeight 21 \
      --startLabel "TSS" \
      --endLabel "TTS" \
      --regionsLabel "gene" \
      --legendLocation "none" \
      --plotTitle $omic 
done

目录

NGS可视化