In [1]:
import hashlib
import os
import shutil
import subprocess
import uuid
from typing import Optional, List
import gzip
import json
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from cooler import Cooler
from pydantic import BaseModel

In [2]:
class HiCExperiment(BaseModel):
    """A class to represent a Hi-C experiment"""

    # Every field is optional and defaults to None, so a record can be built
    # from partial data.
    # NOTE(review): the field names look like keys of an ENCODE experiment
    # JSON document — confirm against the code that populates this model.
    accession: Optional[str] = None
    assay_term_name: Optional[str] = None
    assay_title: Optional[str] = None
    award: Optional[dict] = None
    biosample: Optional[str] = None
    biosample_ontology: Optional[dict] = None
    biosample_summary: Optional[str] = None
    dbxrefs: Optional[list] = None
    description: Optional[str] = None
    lab: Optional[dict] = None
    replicates: Optional[list] = None
    status: Optional[str] = None
    # presumably the accession of the experiment's .hic file (the notebook
    # feeds a same-named metadata column to hic_to_mcool) — TODO confirm
    hic_accession: Optional[str] = None

    def __repr__(self) -> str:
        # One-line summary: "<biosample_summary> - <accession>: <hic_accession>".
        # Unset fields are rendered as the string "None".
        return f"{self.biosample_summary} - {self.accession}: {self.hic_accession}"


# Container model holding a list of HiCExperiment records.
class HiCExperimentList(BaseModel):
    # Required field: no default is declared, so it must be supplied.
    experiments: List[HiCExperiment]


def download_encode(accession, extension, cache_dir="/net/common/cache/encode/GRCh38", timeout=60):
    """Download an ENCODE file by its accession and store the result on disk.

    If the file is already present in ``cache_dir`` it is returned as-is and
    no network request is made. Otherwise the file is streamed to a uniquely
    named temporary file in the current working directory and then moved into
    the cache.

    Args:
        accession: ENCODE file accession (e.g. ``"ENCFF355VJW"``).
        extension: File extension without the leading dot (e.g. ``"hic"``).
        cache_dir: Directory holding the cached downloads. Defaults to the
            location that was previously hard-coded.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Path of the cached file, ``<cache_dir>/<accession>.<extension>``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If the file cannot be written or moved into the cache.
    """
    url = f"https://www.encodeproject.org/files/{accession}/@@download/{accession}.{extension}"
    local_filename = os.path.join(cache_dir, f"{accession}.{extension}")
    if os.path.exists(local_filename):
        return local_filename

    # Fail before the (potentially very large) download rather than at the
    # final move if the cache directory is missing.
    os.makedirs(cache_dir, exist_ok=True)

    temp_file = str(uuid.uuid4()) + f".{extension}"
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as r:
        # Without this check an HTTP error page would be stored in the cache
        # as if it were the requested file.
        r.raise_for_status()
        print(f"Downloading {accession} to {temp_file}")
        try:
            # stream=True + r.raw: copy the body to disk in chunks instead of
            # loading the whole file into memory.
            with open(temp_file, "wb") as f:
                shutil.copyfileobj(r.raw, f)
            shutil.move(temp_file, local_filename)
        except BaseException:
            # Do not leave a partial download behind in the working directory.
            if os.path.exists(temp_file):
                os.remove(temp_file)
            raise
    return local_filename


def verify_encode_file(accession, extension):
    """Download an ENCODE file (if needed) and verify its MD5 checksum.

    The expected checksum is read from the ``md5sum`` key of the file's JSON
    metadata on encodeproject.org and compared with the MD5 of the local copy
    returned by ``download_encode``.

    Args:
        accession: ENCODE file accession.
        extension: File extension without the leading dot.

    Returns:
        Path of the verified local file.

    Raises:
        requests.HTTPError: If the metadata request returns a 4xx/5xx status.
        Exception: If the checksums differ; the local file is deleted first
            so that a later call downloads it again.
    """
    url = f"https://www.encodeproject.org/files/{accession}/?format=json"
    response = requests.get(url, timeout=60)
    # Surface HTTP failures directly instead of as a KeyError / JSON decode
    # error further down.
    response.raise_for_status()
    payload = response.json()
    md5sum = payload["md5sum"]
    local_filename = download_encode(accession, extension)

    # Get md5sum of file, reading in chunks to bound memory use
    file_hash = hashlib.md5()
    with open(local_filename, "rb") as f:
        while chunk := f.read(67108864):  # 64 MB chunks
            file_hash.update(chunk)
    file_md5sum = file_hash.hexdigest()

    # Check md5sum matches
    if md5sum != file_md5sum:
        # Remove the bad copy so it is not served from the cache next time.
        os.remove(local_filename)
        print(f"MD5 checksums don't match for {accession}!")
        raise Exception(f"MD5 checksums don't match for {accession}!")
    print(f"Verified MD5 checksum for {accession}.{extension}")
    return local_filename


def hic_to_mcool(accession, hictk_bin="/home/reimonnt/miniconda3/envs/hictk/bin/hictk"):
    """Convert a hic file to a mcool file.

    If ``<accession>.mcool`` already exists in the working directory it is
    opened at ``/resolutions/50000`` as a sanity check and reused. Otherwise
    the verified ``.hic`` file is copied to a uniquely named temporary file,
    converted with ``hictk convert`` and the result moved into place.

    Args:
        accession: ENCODE accession of the ``.hic`` file.
        hictk_bin: Path of the ``hictk`` executable. Defaults to the location
            that was previously hard-coded.

    Returns:
        The name of the mcool file, ``<accession>.mcool``.

    Raises:
        OSError: If an existing mcool file cannot be opened; the file is
            removed first so that a later call regenerates it.
        subprocess.CalledProcessError: If ``hictk convert`` exits non-zero.
    """
    mcool_file = f"{accession}.mcool"
    if os.path.exists(mcool_file):
        try:
            clr = Cooler(f"{mcool_file}::/resolutions/50000")
            print(clr.info)
            return mcool_file
        except OSError:
            # Drop the unreadable file so that a re-run rebuilds it.
            os.remove(mcool_file)
            raise

    hic_file = verify_encode_file(accession, "hic")
    temp_name = str(uuid.uuid4())
    temp_hic = f"{temp_name}.hic"
    temp_mcool = f"{temp_name}.mcool"
    try:
        shutil.copy(hic_file, temp_hic)
        # check=True: a failed conversion must raise instead of letting a
        # missing or partial output be moved into place.
        subprocess.run(
            [hictk_bin, "convert", temp_hic, temp_mcool],
            capture_output=False,
            check=True,
        )
        shutil.move(temp_mcool, mcool_file)
    finally:
        # Remove temporary files on both success and failure; after a
        # successful move temp_mcool no longer exists.
        for leftover in (temp_hic, temp_mcool):
            if os.path.exists(leftover):
                os.remove(leftover)
    return mcool_file

In [3]:
# Load the sample metadata tables for primary-cell and tissue samples.
# Both paths are relative to the notebook's working directory.
primary_cell = pd.read_csv("metadata-primary-cell-samples.csv")
tissues = pd.read_csv("metadata-tissue-samples.csv")

In [4]:
# Stack both tables row-wise. The original row labels are kept (no
# ignore_index), so index values can repeat across the two sources.
metadata = pd.concat([primary_cell, tissues])
# Preview the first rows of the combined table.
metadata.head()

Unnamed: 0.1,Unnamed: 0,accession,assay_term_name,assay_title,award,biosample,biosample_ontology,biosample_summary,dbxrefs,description,...,format,format-url,format-version,generated-by,metadata,nbins,nchroms,nnz,storage-mode,sum
0,10,ENCSR711AVS,HiC,in situ Hi-C,{'project': 'ENCODE'},mammary epithelial cell,"{'term_name': 'mammary epithelial cell', 'clas...",Homo sapiens mammary epithelial cell,[],HMEC in situ Hi-C experiment,...,HDF5::Cooler,https://github.com/open2c/cooler,3,hictk-v0.0.10-bioconda,{},61776,25,304004037,symmetric-upper,1286376999
1,14,ENCSR351NAI,HiC,intact Hi-C,{'project': 'ENCODE'},"activated CD8-positive, alpha-beta T cell","{'term_name': 'activated CD8-positive, alpha-b...","Homo sapiens activated CD8-positive, alpha-bet...",['GEO:GSE237357'],CD8+ activated intact Hi-C,...,HDF5::Cooler,https://github.com/open2c/cooler,3,hictk-v0.0.10-bioconda,{},61776,25,281321644,symmetric-upper,1009581526
2,17,ENCSR923PPH,HiC,intact Hi-C,{'project': 'ENCODE'},"activated CD4-positive, alpha-beta T cell","{'term_name': 'activated CD4-positive, alpha-b...","Homo sapiens activated CD4-positive, alpha-bet...",['GEO:GSE238027'],naive CD4+ T cell Th0 effector _activated int...,...,HDF5::Cooler,https://github.com/open2c/cooler,3,hictk-v0.0.10-bioconda,{},61776,25,637408941,symmetric-upper,1577549156
3,22,ENCSR456QYU,HiC,intact Hi-C,{'project': 'ENCODE'},dendritic cell,"{'term_name': 'dendritic cell', 'classificatio...",Homo sapiens dendritic cell male adult (51 yea...,['GEO:GSE237482'],Monocytes Dendritic cell day 4 LP23 intact mna...,...,HDF5::Cooler,https://github.com/open2c/cooler,3,hictk-v0.0.10-bioconda,{},61776,25,371618574,symmetric-upper,1161869581
4,27,ENCSR236EYO,HiC,intact Hi-C,{'project': 'ENCODE'},CD14-positive monocyte,"{'term_name': 'CD14-positive monocyte', 'class...",Homo sapiens CD14-positive monocyte male adult...,['GEO:GSE237421'],Monocytes intact Hi-C,...,HDF5::Cooler,https://github.com/open2c/cooler,3,hictk-v0.0.10-bioconda,{},61776,25,417065358,symmetric-upper,1100063813


In [5]:
# One entry per metadata row, taken from the `hic_accession` column.
accessions = metadata["hic_accession"].tolist()
print(f"Number of accessions: {len(accessions)}")

Number of accessions: 73


In [6]:
# accessions

In [None]:
# Download, checksum-verify and convert every accession to .mcool by running
# hic_to_mcool on a pool of 16 worker threads. pool.map returns the resulting
# file names in the same order as `accessions`; an exception raised in any
# worker is re-raised here.
with ThreadPool(16) as pool:
    mcool_filelist = pool.map(hic_to_mcool, accessions)

Downloading ENCFF355VJW to cba8f783-67a6-4ae9-8eec-731bc0bed7c5.hic
Downloading ENCFF654YIQ to 2e33190d-af04-4759-8e06-0507f861361b.hic
Downloading ENCFF980NXK to 7ae43b63-d72f-43fd-8851-e076c05ae9df.hic
Downloading ENCFF273XBU to 07538917-a274-4642-9453-6b1f9d4c85cd.hic
Downloading ENCFF943JRY to a6edaded-8dcc-438b-a60d-bcd25522dcd1.hic
Downloading ENCFF579CAR to c5861dc9-d7a4-4c9c-99c0-86d0a3d94145.hic
Downloading ENCFF520GFL to 5d2aa7e6-15fc-4f47-a2b2-7b44fe25a943.hic
Downloading ENCFF783KQI to 846abe5c-2c77-4d8b-9c61-27e931cc4714.hic
Downloading ENCFF586MQY to e4f67c70-4fff-4218-a204-e3a17035a7cc.hic
Downloading ENCFF185AYZ to 63e78b92-7c80-4e13-96e9-c9c8104a8221.hic
Downloading ENCFF044TCQ to d24d1f98-b47a-4c5d-82cc-f5a7bbedc0f8.hic
Downloading ENCFF235LCO to 1ed2fe6a-e26d-441e-a16b-57fa23bf4864.hic
Downloading ENCFF076LWH to c7347a68-22ac-447a-b001-49be042c6414.hic
Downloading ENCFF962EDB to 4597bcf1-d31d-48da-b93c-9fafb984a383.hic
Downloading ENCFF952JZV to 998b885b-9c25-4ffd-84

[2024-09-10 11:06:09.491] [info]: Running hictk v1.0.0-bioconda
[2024-09-10 11:06:09.492] [info]: Converting cebe062e-5348-4498-aa61-8f5ef7e46279.hic to cebe062e-5348-4498-aa61-8f5ef7e46279.mcool (hic -> mcool)...
[2024-09-10 11:06:15.373] [info]: [100] begin processing 100bp matrix...
[2024-09-10 11:06:34.497] [info]: [100] processing chr1:22064200-22064300 at 523040 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:06:42.576] [info]: [100] processing chr1:42851400-42851500 at 1237930 pixels/s (cache hit rate 0.02%)...
[2024-09-10 11:06:51.663] [info]: [100] processing chr1:63329300-63329400 at 1100594 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:07:00.773] [info]: [100] processing chr1:85668400-85668500 at 1097695 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:07:08.980] [info]: [100] processing chr1:110274600-110274700 at 1218621 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:07:16.777] [info]: [100] processing chr1:161715200-161715300 at 1282709 pixels/s (cache hit rate 

Verified MD5 checksum for ENCFF943JRY.hic


[2024-09-10 11:09:42.704] [info]: [100] processing chr3:197551300-197551400 at 2748763 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:10:04.896] [info]: [100] processing chr4:22470400-22470500 at 450633 pixels/s (cache hit rate 0.01%)...
[2024-09-10 11:10:12.925] [info]: [100] processing chr4:46229600-46229700 at 1245485 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:10:21.425] [info]: [100] processing chr4:73738700-73738800 at 1176609 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:10:28.390] [info]: [100] processing chr4:98487500-98487600 at 1435750 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:10:34.812] [info]: [100] processing chr4:125394100-125394200 at 1557390 pixels/s (cache hit rate 0.03%)...
[2024-09-10 11:10:40.071] [info]: [100] processing chr4:153160500-153160600 at 1901864 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:10:42.289] [info]: Running hictk v1.0.0-bioconda
[2024-09-10 11:10:42.289] [info]: Converting 98c75b2e-5f32-42e0-bc8e-059a4bb4da9a.hic to 98

Downloading ENCFF625VNK to a2732efc-249d-4624-bbf4-440cf0e0d282.hic


[2024-09-10 11:17:30.347] [info]: [100] processing chr9:100890100-100890200 at 2336449 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:17:33.951] [info]: [100] processing chr9:124946400-124946500 at 2774695 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:17:42.259] [info]: [100] processing chr10:9790600-9790700 at 1203804 pixels/s (cache hit rate 0.03%)...
[2024-09-10 11:17:47.911] [info]: [100] processing chr10:31914700-31914800 at 1769598 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:17:53.872] [info]: [100] processing chr10:61607000-61607100 at 1677571 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:17:58.666] [info]: [100] processing chr10:84867700-84867800 at 2086376 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:18:02.680] [info]: [100] processing chr10:108917300-108917400 at 2491901 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:18:09.583] [info]: [100] processing chr11:1698800-1698900 at 1448646 pixels/s (cache hit rate 0.12%)...
[2024-09-10 11:18:16.091] [info]

Downloading ENCFF493SFI to 05ff0aac-dfd1-4d0a-ac05-91129108d948.hic
Verified MD5 checksum for ENCFF783KQI.hic


[2024-09-10 11:35:29.203] [info]: Running hictk v1.0.0-bioconda
[2024-09-10 11:35:29.203] [info]: Converting c9144acd-ec6d-40ec-bb7b-6737b7463167.hic to c9144acd-ec6d-40ec-bb7b-6737b7463167.mcool (hic -> mcool)...
[2024-09-10 11:36:15.403] [info]: [10] begin processing 10bp matrix...
[2024-09-10 11:36:39.096] [info]: [10] processing chr1:20362050-20362060 at 422155 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:36:47.205] [info]: [10] processing chr1:38924630-38924640 at 1233350 pixels/s (cache hit rate 0.02%)...
[2024-09-10 11:36:56.070] [info]: [10] processing chr1:56824170-56824180 at 1128032 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:37:03.446] [info]: [10] processing chr1:73105490-73105500 at 1355932 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:37:11.829] [info]: [10] processing chr1:91231630-91231640 at 1192890 pixels/s (cache hit rate 0.00%)...
[2024-09-10 11:37:18.800] [info]: [10] processing chr1:112956860-112956870 at 1434720 pixels/s (cache hit rate 0.00%)...
