{"lab": {"status": "current", "display_title": "4DN DCIC, HMS", "@type": ["Lab", "Item"], "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "@id": "/labs/4dn-dcic-lab/", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "award": {"status": "current", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "title": "Capture Hi-C", "status": "released", "cfde_term": {"status": "released", "display_title": "capture Hi-C assay", "@type": ["OntologyTerm", "Item"], "@id": "/ontology-terms/OBI:0002984/", "uuid": "c2b06196-9a76-4584-bea2-923df499ccdb", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "other_tags": ["DNA-DNA", "3D", "Pairwise"], "date_created": "2019-03-28T18:42:40.401137+00:00", "submitted_by": {"error": "no view permissions"}, "last_modified": {"modified_by": {"error": "no view permissions"}, "date_modified": "2024-07-26T17:22:09.103584+00:00"}, "raw_file_types": "Reads (fastq) provided by lab", "reference_pubs": [{"display_title": "Mifsud B et al. (2015) PMID:25938943", "journal": "Nature genetics", "uuid": "2058cadb-536f-4735-9b59-d5f462bce0ca", "short_attribution": "Mifsud B et al. (2015)", "@type": ["Publication", "Item"], "date_published": "2015-06", "status": "current", "@id": "/publications/2058cadb-536f-4735-9b59-d5f462bce0ca/", "authors": ["Mifsud B", "Tavares-Cadete F", "Young AN", "Sugar R", "Schoenfelder S", "Ferreira L", "Wingett SW", "Andrews S", "Grey W", "Ewels PA", "Herman B", "Happe S", "Higgs A", "LeProust E", "Follows GA", "Fraser P", "Luscombe NM", "Osborne CS"], "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}], "schema_version": "1", "static_content": [{"content": {"options": {"filetype": "md", "collapsible": false, "default_open": true}, "filetype": "md", "content": "The 4DN Hi-C data processing pipeline includes alignment, filtering, and matrix aggregation and normalization steps. Feature calling steps, including insulation scores, compartments, and enriched contacts will be provided in the next version of the pipeline.\n\n<dl>\n<a href=\"https://s3.amazonaws.com/4dn-dcic-public/static-pages/hicpipeline.png\" target=\"_blank\">\n<img src=\"https://s3.amazonaws.com/4dn-dcic-public/static-pages/hicpipeline.png\" width=\"100%\"/>\n</a>\n</dl>\n\n\n", "display_title": "Overview", "uuid": "8bf6a722-5841-4a6a-9e4b-947dcf5d7af3", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/8bf6a722-5841-4a6a-9e4b-947dcf5d7af3/", "content_as_html": "<div class=\"markdown-container\"><p>The 4DN Hi-C data processing pipeline includes alignment, filtering, and matrix aggregation and normalization steps. Feature calling steps, including insulation scores, compartments, and enriched contacts will be provided in the next version of the pipeline.</p>\n<dl>\n<a href=\"https://s3.amazonaws.com/4dn-dcic-public/static-pages/hicpipeline.png\" rel=\"noopener noreferrer\" target=\"_blank\">\n<img src=\"https://s3.amazonaws.com/4dn-dcic-public/static-pages/hicpipeline.png\" width=\"100%\"/>\n</a>\n</dl></div>", "name": "resources.data-analysis.hi_c-processing-pipeline.overview", "title": "Overview", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.986b362f-4eb6-4a9c-8173-3ab267307e3a"]}}, "location": "tab:data_processing"}, {"content": {"options": {"filetype": "html", "collapsible": false, "default_open": true}, "filetype": "html", "content": "Below is a description of the file formats available on the portal as outputs of the data processing pipeline. \n\n<br><br>\n\n<style>\ntable, th, td {\n    border: 1px solid #ddd;\n    font-size: 100%;\n    padding: 20px;\n}\n</style>\n\n<table style=\"width:100%\">\n    <thead>\n        <tr>\n            <th style=\"text-align:left; padding:20px; width:130px\">File Type</th>\n            <th style=\"text-align:left; padding:20px\">File Format</th>\n            <th style=\"text-align:left; padding:20px\">Description</th>\n        </tr>\n    </thead>\n    <tr>\n        <td style=\"padding:20px\">Alignment</td>\n        <td style=\"padding:20px\">.bam </td>\n        <td style=\"padding:20px\">Filtered alignments of raw reads to the appropriate reference are available in .bam format. These are generated separately for each replicate experiment in an experiment set.</td>\n    </tr>\n    <tr>\n        <td style=\"padding:20px\">Contact List</td>\n        <td style=\"padding:20px\"><a href=\"https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md\">.pairs</a></td>\n        <td style=\"padding:20px\">Contain a list of pairwise contacts and are generated after alignments are filtered. Pairs are generated separately for each replicate, and an additional pairs file is generated after replicates are merged. Quality metrics generated by <a href=\"https://github.com/4dn-dcic/pairsqc\">PairsQC</a> are also provided for each pairs file (a sample report can be found <a href=\"https://s3.amazonaws.com/4dn-github-related-files/pairsqc/test_report_d3_v4/pairsqc_report.html\">here</a>).</td>\n    </tr>\n    <tr>\n        <td style=\"padding:20px\">Contact Matrix</td>\n        <td style=\"padding:20px\"><a href=\"https://cooler.readthedocs.io/en/latest/datamodel.html\">.mcool</a></td>\n        <td style=\"padding:20px\">Multi-resolution contact matrices generated in the final step of the processing pipeline are available in .mcool format, generated by <a href=\"https://cooler.readthedocs.io/en/latest/\">Cooler</a>. These can be visualized on the portal with <a href=\"https://higlass.io\">HiGlass</a>. Generally, mcool contact matrices are normalized with the ICE matrix balancing algorithm (iterative correction and eigenvalue decomposition), though this step is not performed for Capture Hi-C or ChIA-PET. </td>\n    </tr>\n    <tr>\n        <td style=\"padding:20px\">Contact Matrix</td>\n        <td style=\"padding:20px\">.hic</td>\n        <td style=\"padding:20px\">Contact matrices generated by <a href=\"https://github.com/aidenlab/juicer/wiki/Pre\">Juicertools</a> are also available in .hic format, and these can be visualized externally in <a href=\"https://www.aidenlab.org/juicebox/\">Juicebox</a> or in the <a href=\"http://epigenomegateway.wustl.edu/browser/\">WashU Epigenome Browser</a>. For in situ Hi-C and dilution Hi-C, the .hic matrix is normalized using the VC, VC_SQRT, KR methods, after filtering out intra-fragment contacts (contacts that fall in the same restriction fragment). For DNase Hi-C and Micro-C, this filtering step is not performed, as these protocols don't use a restriction enzyme for digestion.</td>\n    </tr>\n</table>", "display_title": "hic-processed-files-html", "uuid": "ca83de05-1055-4641-8fb3-42031085636b", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/ca83de05-1055-4641-8fb3-42031085636b/", "content_as_html": "<div class=\"html-container\">Below is a description of the file formats available on the portal as outputs of the data processing pipeline. \n\n<br/><br/>\n<style>\ntable, th, td {\n    border: 1px solid #ddd;\n    font-size: 100%;\n    padding: 20px;\n}\n</style>\n<table style=\"width:100%\">\n<thead>\n<tr>\n<th style=\"text-align:left; padding:20px; width:130px\">File Type</th>\n<th style=\"text-align:left; padding:20px\">File Format</th>\n<th style=\"text-align:left; padding:20px\">Description</th>\n</tr>\n</thead>\n<tr>\n<td style=\"padding:20px\">Alignment</td>\n<td style=\"padding:20px\">.bam </td>\n<td style=\"padding:20px\">Filtered alignments of raw reads to the appropriate reference are available in .bam format. These are generated separately for each replicate experiment in an experiment set.</td>\n</tr>\n<tr>\n<td style=\"padding:20px\">Contact List</td>\n<td style=\"padding:20px\"><a href=\"https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md\" rel=\"noopener noreferrer\" target=\"_blank\">.pairs</a></td>\n<td style=\"padding:20px\">Contain a list of pairwise contacts and are generated after alignments are filtered. Pairs are generated separately for each replicate, and an additional pairs file is generated after replicates are merged. Quality metrics generated by <a href=\"https://github.com/4dn-dcic/pairsqc\" rel=\"noopener noreferrer\" target=\"_blank\">PairsQC</a> are also provided for each pairs file (a sample report can be found <a href=\"https://s3.amazonaws.com/4dn-github-related-files/pairsqc/test_report_d3_v4/pairsqc_report.html\" rel=\"noopener noreferrer\" target=\"_blank\">here</a>).</td>\n</tr>\n<tr>\n<td style=\"padding:20px\">Contact Matrix</td>\n<td style=\"padding:20px\"><a href=\"https://cooler.readthedocs.io/en/latest/datamodel.html\" rel=\"noopener noreferrer\" target=\"_blank\">.mcool</a></td>\n<td style=\"padding:20px\">Multi-resolution contact matrices generated in the final step of the processing pipeline are available in .mcool format, generated by <a href=\"https://cooler.readthedocs.io/en/latest/\" rel=\"noopener noreferrer\" target=\"_blank\">Cooler</a>. These can be visualized on the portal with <a href=\"https://higlass.io\" rel=\"noopener noreferrer\" target=\"_blank\">HiGlass</a>. Generally, mcool contact matrices are normalized with the ICE matrix balancing algorithm (iterative correction and eigenvalue decomposition), though this step is not performed for Capture Hi-C or ChIA-PET. </td>\n</tr>\n<tr>\n<td style=\"padding:20px\">Contact Matrix</td>\n<td style=\"padding:20px\">.hic</td>\n<td style=\"padding:20px\">Contact matrices generated by <a href=\"https://github.com/aidenlab/juicer/wiki/Pre\" rel=\"noopener noreferrer\" target=\"_blank\">Juicertools</a> are also available in .hic format, and these can be visualized externally in <a href=\"https://www.aidenlab.org/juicebox/\" rel=\"noopener noreferrer\" target=\"_blank\">Juicebox</a> or in the <a href=\"http://epigenomegateway.wustl.edu/browser/\" rel=\"noopener noreferrer\" target=\"_blank\">WashU Epigenome Browser</a>. For in situ Hi-C and dilution Hi-C, the .hic matrix is normalized using the VC, VC_SQRT, KR methods, after filtering out intra-fragment contacts (contacts that fall in the same restriction fragment). For DNase Hi-C and Micro-C, this filtering step is not performed, as these protocols don't use a restriction enzyme for digestion.</td>\n</tr>\n</table></div>", "name": "hic-processed-files-html", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.986b362f-4eb6-4a9c-8173-3ab267307e3a"]}}, "location": "tab:processed_files"}, {"content": {"options": {"filetype": "rst", "collapsible": false, "default_open": true, "convert_ext_links": true}, "filetype": "rst", "content": "Hi-C reads are mapped to the `GRCh38 </files-reference/4DNFIZQZ39L9/>`_ (human) or `mm10 </files-reference/4DNFI823LSI8/>`_ (mouse) reference genome using ``bwa`` version 0.7.17. Specifically, we run: ::\n\n  bwa mem -SP5M -t<nthreads> <genome_index> <fastq1> <fastq2>\n\n* The ``-SP`` option is used to ensure the results are equivalent to that obtained by running ``bwa mem`` on each mate separately, while retaining the right formatting for paired-end reads. This option skips a step in ``bwa mem`` that forces alignment of a poorly aligned read given an alignment of its mate with the assumption that the two mates are part of a single genomic segment.\n* The ``-5`` option is used to report the 5' portion of chimeric alignments as the primary alignment. In Hi-C experiments, when a mate has chimeric alignments, typically, the 5' portion is the position of interest, while the 3' portion represents the same fragment as the mate. For chimeric alignments, ``bwa mem`` reports two alignments: one of them is annotated as primary and soft-clipped, retaining the full-length of the original sequence. The other end is annotated as hard-clipped and marked as either 'supplementary' or 'secondary'. The ``-5`` option forces the 5'end to be always annotated as primary.\n* The ``-M`` option is used to annotate the secondary/supplementary clipped reads as **secondary** rather than **supplementary**, for compatibility with some public software tools such as ``picard MarkDuplicates``.\n* The ``-t`` option is used for multi-threading and  should not affect the result.", "display_title": "Alignment", "uuid": "10000000-0000-0000-0000-ffff00001000", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/10000000-0000-0000-0000-ffff00001000/", "content_as_html": "<div class=\"rst-container\"><p>Hi-C reads are mapped to the <a class=\"reference external\" href=\"/files-reference/4DNFIZQZ39L9/\">GRCh38</a> (human) or <a class=\"reference external\" href=\"/files-reference/4DNFI823LSI8/\">mm10</a> (mouse) reference genome using <code>bwa</code> version 0.7.17. Specifically, we run:</p><pre class=\"literal-block\">\nbwa mem -SP5M -t&lt;nthreads&gt; &lt;genome_index&gt; &lt;fastq1&gt; &lt;fastq2&gt;\n</pre><ul class=\"simple\"><li>The <code>-SP</code> option is used to ensure the results are equivalent to that obtained by running <code>bwa mem</code> on each mate separately, while retaining the right formatting for paired-end reads. This option skips a step in <code>bwa mem</code> that forces alignment of a poorly aligned read given an alignment of its mate with the assumption that the two mates are part of a single genomic segment.</li><li>The <code>-5</code> option is used to report the 5' portion of chimeric alignments as the primary alignment. In Hi-C experiments, when a mate has chimeric alignments, typically, the 5' portion is the position of interest, while the 3' portion represents the same fragment as the mate. For chimeric alignments, <code>bwa mem</code> reports two alignments: one of them is annotated as primary and soft-clipped, retaining the full-length of the original sequence. The other end is annotated as hard-clipped and marked as either 'supplementary' or 'secondary'. The <code>-5</code> option forces the 5'end to be always annotated as primary.</li><li>The <code>-M</code> option is used to annotate the secondary/supplementary clipped reads as <strong>secondary</strong> rather than <strong>supplementary</strong>, for compatibility with some public software tools such as <code>picard MarkDuplicates</code>.</li><li>The <code>-t</code> option is used for multi-threading and  should not affect the result.</li></ul></div>", "name": "resources.data-analysis.hi_c-processing-pipeline.alignment", "title": "Alignment", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.986b362f-4eb6-4a9c-8173-3ab267307e3a"]}}, "location": "tab:data_processing"}, {"content": {"options": {"filetype": "md", "collapsible": false, "default_open": true, "convert_ext_links": true}, "filetype": "md", "content": "For filtering valid Hi-C alignments, we use [`pairtools`](https://github.com/mirnylab/pairtools) (previously called `pairsamtools`). Specifically, we use [version `0.2.2`](https://github.com/mirnylab/pairtools/tree/v0.2.2). The filtering workflow outputs a [pairs](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md) file containing a list of valid contacts. \n\nThis filtering workflow applies the following criteria:\n\n* Reads marked as duplicates are removed.\n* Full-length alignments that are unique are kept.\n   * An unmapped portion shorter than 20bp is ignored; and the rest of the alignment is still considered as full-length.\n*  In addition, clipped (chimeric) alignments are kept, if they are valid Hi-C contacts. If one mate is clipped and the other is full-length and the 3'end of the clipped alignment is mapped within 2kb of the full-length alignment in the orientation that the two 3'ends are pointing toward each other they are considered valid contacts.\n   * As with full-length alignments, any unmapped portion shorter than 20bp is ignored.\n\nOne of the design choices we have made is to include a lossless bam file as an output of the data processing. This output file, containing all the sequences in the original fastq files, the alignment results, and pairtools-provided flags for read filtering, is provided as a resource. To be able to produce this output file, the contents of the bam file is carried forward in the filtering workflow in intermediate `pairsam` files. Users who are only interested in the valid contact lists may run the same analysis with more light-weight intermediate files.\n\nSpecifically, the filtering workflow consists of the following steps:\n\n* `pairtools parse`\n   * A bam file is read in, and a [pairsam](https://pairsamtools.readthedocs.io/en/latest/pairsam.html) file is written out.\n   * The pairsam file is a pairs file, listing one read pair per line, with additional columns to track the sam-file lines, and a pairtools read classification.\n   * These classifications include information on whether the read aligned to 0, 1, or multiple places in the genome and whether it aligned end-to-end or if it was clipped.\n   * This tool also upper-triangularizes the reads, i.e. if the coordinate of second read is higher than the first, the reads are flipped.\n   * For more details, see [pairsamtools doc](https://pairsamtools.readthedocs.io/en/latest/parsing.html)\n\n* `pairtools sort`\n   * A `pairsam` (or generically `pairs`) file is read in, and a `pairsam` file is written out.\n   * The rows are sorted in chr1-chr2-pos1-pos2 order.\n   * Note that the flipping order and sort order of chromosomes is not identical. See [the docs](https://pairtools.readthedocs.io/en/latest/sorting.html#chromosomal-order-for-sorting-and-flipping) for more details.\n\n* `pairtools merge`\n   * One or more `pairsam` (or generically `pairs`) files are read in, and a `pairsam` file is written out.\n   * The files are merged, preserving the sorted order.\n\n* `pairtools dedup --mark-dups`\n   * (equivalent to `pairtools markasdup`)\n   * Duplicate alignments that share the same pair of 5'end coordinates +/- 3 bps are marked as identified.\n   * An arbitrary one is retained with the original classification, while others get a duplicate classification.\n\n* `pairtools select`\n   * Only the reads with pairtools classification `UU` and `UC` are retained and output to a `pairs` file.", "display_title": "Filtering", "uuid": "10000000-0000-0000-0000-ffff00001001", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/10000000-0000-0000-0000-ffff00001001/", "content_as_html": "<div class=\"markdown-container\"><p>For filtering valid Hi-C alignments, we use <a href=\"https://github.com/mirnylab/pairtools\" rel=\"noopener noreferrer\" target=\"_blank\"><code>pairtools</code></a> (previously called <code>pairsamtools</code>). Specifically, we use <a href=\"https://github.com/mirnylab/pairtools/tree/v0.2.2\" rel=\"noopener noreferrer\" target=\"_blank\">version <code>0.2.2</code></a>. The filtering workflow outputs a <a href=\"https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md\" rel=\"noopener noreferrer\" target=\"_blank\">pairs</a> file containing a list of valid contacts. </p>\n<p>This filtering workflow applies the following criteria:</p>\n<ul>\n<li>Reads marked as duplicates are removed.</li>\n<li>Full-length alignments that are unique are kept.</li>\n<li>An unmapped portion shorter than 20bp is ignored; and the rest of the alignment is still considered as full-length.</li>\n<li>In addition, clipped (chimeric) alignments are kept, if they are valid Hi-C contacts. If one mate is clipped and the other is full-length and the 3'end of the clipped alignment is mapped within 2kb of the full-length alignment in the orientation that the two 3'ends are pointing toward each other they are considered valid contacts.</li>\n<li>As with full-length alignments, any unmapped portion shorter than 20bp is ignored.</li>\n</ul>\n<p>One of the design choices we have made is to include a lossless bam file as an output of the data processing. This output file, containing all the sequences in the original fastq files, the alignment results, and pairtools-provided flags for read filtering, is provided as a resource. To be able to produce this output file, the contents of the bam file is carried forward in the filtering workflow in intermediate <code>pairsam</code> files. Users who are only interested in the valid contact lists may run the same analysis with more light-weight intermediate files.</p>\n<p>Specifically, the filtering workflow consists of the following steps:</p>\n<ul>\n<li><code>pairtools parse</code></li>\n<li>A bam file is read in, and a <a href=\"https://pairsamtools.readthedocs.io/en/latest/pairsam.html\" rel=\"noopener noreferrer\" target=\"_blank\">pairsam</a> file is written out.</li>\n<li>The pairsam file is a pairs file, listing one read pair per line, with additional columns to track the sam-file lines, and a pairtools read classification.</li>\n<li>These classifications include information on whether the read aligned to 0, 1, or multiple places in the genome and whether it aligned end-to-end or if it was clipped.</li>\n<li>This tool also upper-triangularizes the reads, i.e. if the coordinate of second read is higher than the first, the reads are flipped.</li>\n<li>\n<p>For more details, see <a href=\"https://pairsamtools.readthedocs.io/en/latest/parsing.html\" rel=\"noopener noreferrer\" target=\"_blank\">pairsamtools doc</a></p>\n</li>\n<li>\n<p><code>pairtools sort</code></p>\n</li>\n<li>A <code>pairsam</code> (or generically <code>pairs</code>) file is read in, and a <code>pairsam</code> file is written out.</li>\n<li>The rows are sorted in chr1-chr2-pos1-pos2 order.</li>\n<li>\n<p>Note that the flipping order and sort order of chromosomes is not identical. See <a href=\"https://pairtools.readthedocs.io/en/latest/sorting.html#chromosomal-order-for-sorting-and-flipping\" rel=\"noopener noreferrer\" target=\"_blank\">the docs</a> for more details.</p>\n</li>\n<li>\n<p><code>pairtools merge</code></p>\n</li>\n<li>One or more <code>pairsam</code> (or generically <code>pairs</code>) files are read in, and a <code>pairsam</code> file is written out.</li>\n<li>\n<p>The files are merged, preserving the sorted order.</p>\n</li>\n<li>\n<p><code>pairtools dedup --mark-dups</code></p>\n</li>\n<li>(equivalent to <code>pairtools markasdup</code>)</li>\n<li>Duplicate alignments that share the same pair of 5'end coordinates +/- 3 bps are marked as identified.</li>\n<li>\n<p>An arbitrary one is retained with the original classification, while others get a duplicate classification.</p>\n</li>\n<li>\n<p><code>pairtools select</code></p>\n</li>\n<li>Only the reads with pairtools classification <code>UU</code> and <code>UC</code> are retained and output to a <code>pairs</code> file.</li>\n</ul></div>", "name": "resources.data-analysis.hi_c-processing-pipeline.filtering", "title": "Filtering", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.986b362f-4eb6-4a9c-8173-3ab267307e3a"]}}, "location": "tab:data_processing"}, {"content": {"options": {"filetype": "md", "collapsible": false, "default_open": true}, "filetype": "md", "content": "For Capture Hi-C, the Hi-C pipeline is run with the default \nrestriction enzyme-based intra-fragment contact filtering, \nbut matrix balancing is not performed.\n\n4DN DCIC provides a Hi-C matrix in two different formats: \n`.mcool` format and `.hic` format. The two files are generated\n from the same `pairs` file as input filtered contact list. \nBoth files contain multiple resolutions.\n\n** .hic format**\n\n  * A `.hic` file is produced by [Juicertools](https://github.com/theaidenlab/Juicer/wiki/Pre) (version 1.8.9-cuda8) \nand can be visualized using [Juicebox](https://www.aidenlab.org/juicebox/)\n  * The matrix is normalized using the VC, VC_SQRT, KR methods.\n\n** .mcool format**\n\n  * An `.mcool` file is produced by [Cooler](https://github.com/mirnylab/cooler) (version 0.7.6) and can be visualized using [HiGlass](http://higlass.io/).\n  * The diagonal and the rows/columns with a low value are \nremoved from the matrix.\n  * The `.mcool` file also contains the normalization vectors \ngenerated by *Juicertools* (same as in a `.hic` file generated\n from the same `pairs` file)\n\n<br>\n\n**Resolutions**: Both `mcool` and `hic` files contain the \nfollowing resolutions.\n\n* 1kb, 2kb, 5kb, 10kb, 25kb, 50kb, 100kb, 250kb, 500kb, 1Mb, \n2.5Mb, 5Mb, 10Mb\n\n", "display_title": "Matrix Aggregation and Normalization", "uuid": "4b8d6e04-0a69-4a08-aafc-a4a99b45b941", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/4b8d6e04-0a69-4a08-aafc-a4a99b45b941/", "content_as_html": "<div class=\"markdown-container\"><p>For Capture Hi-C, the Hi-C pipeline is run with the default \nrestriction enzyme-based intra-fragment contact filtering, \nbut matrix balancing is not performed.</p>\n<p>4DN DCIC provides a Hi-C matrix in two different formats: \n<code>.mcool</code> format and <code>.hic</code> format. The two files are generated\n from the same <code>pairs</code> file as input filtered contact list. \nBoth files contain multiple resolutions.</p>\n<p>** .hic format**</p>\n<ul>\n<li>A <code>.hic</code> file is produced by <a href=\"https://github.com/theaidenlab/Juicer/wiki/Pre\" rel=\"noopener noreferrer\" target=\"_blank\">Juicertools</a> (version 1.8.9-cuda8) \nand can be visualized using <a href=\"https://www.aidenlab.org/juicebox/\" rel=\"noopener noreferrer\" target=\"_blank\">Juicebox</a></li>\n<li>The matrix is normalized using the VC, VC_SQRT, KR methods.</li>\n</ul>\n<p>** .mcool format**</p>\n<ul>\n<li>An <code>.mcool</code> file is produced by <a href=\"https://github.com/mirnylab/cooler\" rel=\"noopener noreferrer\" target=\"_blank\">Cooler</a> (version 0.7.6) and can be visualized using <a href=\"http://higlass.io/\" rel=\"noopener noreferrer\" target=\"_blank\">HiGlass</a>.</li>\n<li>The diagonal and the rows/columns with a low value are \nremoved from the matrix.</li>\n<li>The <code>.mcool</code> file also contains the normalization vectors \ngenerated by <em>Juicertools</em> (same as in a <code>.hic</code> file generated\n from the same <code>pairs</code> file)</li>\n</ul>\n<p><br/></p>\n<p><strong>Resolutions</strong>: Both <code>mcool</code> and <code>hic</code> files contain the \nfollowing resolutions.</p>\n<ul>\n<li>1kb, 2kb, 5kb, 10kb, 25kb, 50kb, 100kb, 250kb, 500kb, 1Mb, \n2.5Mb, 5Mb, 10Mb</li>\n</ul></div>", "name": "exptype-capc-matrix-agg", "title": "Matrix Aggregation and Normalization", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.e2324f87-0625-4bbc-803b-d47677aebe08"]}}, "location": "tab:data_processing"}, {"content": {"options": {"filetype": "md", "collapsible": false, "default_open": true}, "filetype": "md", "content": "The pipeline components are pre-installed in a publicly available Docker image (`duplexa/4dn-hic:v43`) on [Docker Hub](https://hub.docker.com/r/duplexa/4dn-hic/). The source code for the Docker image and pipeline description in Common Workflow Language (CWL) can be found on GitHub.\n\n* Latest runs\n\n  Content-wise, `0.2.5`, `0.2.6`, `0.2.7` can be considered (nearly) identical.\n\n  * `0.2.7`\n    * CWL : https://github.com/4dn-dcic/docker-4dn-hic/tree/v43/cwl\n    * Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v43\n  * `0.2.5`/`0.2.6`\n    * CWL : https://github.com/4dn-dcic/docker-4dn-hic/tree/v42.2/cwl\n    * Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v42.2\n* Old runs\n  * `0.2.0`\n    * CWL : https://github.com/4dn-dcic/pipelines-cwl/tree/0.2.0/cwl_awsem/\n    * Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v40\n\nExample set of commands that were actually run as part of the pipeline can be found at https://github.com/4dn-dcic/docker-4dn-hic/blob/v43/HiCPipeline.md", "display_title": "Source files", "uuid": "0457856c-a342-4781-832e-bb1b95c03290", "@type": ["StaticSection", "UserContent", "Item"], "award": {"status": "current", "@type": ["Award", "Item"], "@id": "/awards/1U01CA200059-01/", "uuid": "b0b9c607-f8b4-4f02-93f4-9895b461334b", "display_title": "4D NUCLEOME NETWORK DATA COORDINATION AND INTEGRATION CENTER - PHASE I", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "status": "released", "lab": {"status": "current", "@id": "/labs/4dn-dcic-lab/", "@type": ["Lab", "Item"], "display_title": "4DN DCIC, HMS", "uuid": "828cd4fe-ebb0-4b36-a94a-d2e3a36cc989", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.lab_submitter", "submits_for.828cd4fe-ebb0-4b36-a94a-d2e3a36cc989"]}}, "@id": "/static-sections/0457856c-a342-4781-832e-bb1b95c03290/", "content_as_html": "<div class=\"markdown-container\"><p>The pipeline components are pre-installed in a publicly available Docker image (<code>duplexa/4dn-hic:v43</code>) on <a href=\"https://hub.docker.com/r/duplexa/4dn-hic/\" rel=\"noopener noreferrer\" target=\"_blank\">Docker Hub</a>. The source code for the Docker image and pipeline description in Common Workflow Language (CWL) can be found on GitHub.</p>\n<ul>\n<li>Latest runs</li>\n</ul>\n<p>Content-wise, <code>0.2.5</code>, <code>0.2.6</code>, <code>0.2.7</code> can be considered (nearly) identical.</p>\n<ul>\n<li><code>0.2.7</code><ul>\n<li>CWL : https://github.com/4dn-dcic/docker-4dn-hic/tree/v43/cwl</li>\n<li>Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v43</li>\n</ul>\n</li>\n<li><code>0.2.5</code>/<code>0.2.6</code><ul>\n<li>CWL : https://github.com/4dn-dcic/docker-4dn-hic/tree/v42.2/cwl</li>\n<li>Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v42.2</li>\n</ul>\n</li>\n<li>Old runs</li>\n<li><code>0.2.0</code><ul>\n<li>CWL : https://github.com/4dn-dcic/pipelines-cwl/tree/0.2.0/cwl_awsem/</li>\n<li>Docker : https://github.com/4dn-dcic/docker-4dn-hic/tree/v40</li>\n</ul>\n</li>\n</ul>\n<p>Example set of commands that were actually run as part of the pipeline can be found at https://github.com/4dn-dcic/docker-4dn-hic/blob/v43/HiCPipeline.md</p></div>", "name": "resources.data-analysis.hi_c-processing-pipeline.source-files", "title": "Source files", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin", "role.owner", "userid.986b362f-4eb6-4a9c-8173-3ab267307e3a"]}}, "location": "tab:data_processing"}], "controlled_term": {"term_url": "http://www.ebi.ac.uk/efo/EFO_0008674", "uuid": "5e9dcd04-e577-4915-b868-65095167dc27", "term_id": "EFO:0008674", "@id": "/ontology-terms/EFO:0008674/", "term_name": "Capture-HiC", "status": "released", "preferred_name": "Capture-HiC", "display_title": "Capture-HiC", "@type": ["OntologyTerm", "Item"], "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}}, "experiment_name": "capture-hi-c", "current_pipeline": "HiC_Pipeline_0.3.0", "valid_item_types": ["ExperimentCaptureC"], "accepted_pipelines": ["HiC_Pipeline_0.2.6", "HiC_Pipeline_0.2.7"], "experiment_category": "Sequencing", "assay_classification": "3C via Ligation", "assay_subclass_short": "Enrichment Hi-C", "assay_subclassification": "DNA-DNA Pairwise Interactions of Enriched Regions", "@id": "/experiment-types/capture-hi-c/", "@type": ["ExperimentType", "Item"], "uuid": "62ddb15e-4907-4b8a-bfca-cdd4371cb179", "principals_allowed": {"view": ["system.Everyone"], "edit": ["group.admin"]}, "display_title": "Capture Hi-C", "external_references": [], "@context": "/terms/", "aggregated-items": {}, "validation-errors": []}