{"@context":{"@language":"en","@vocab":"https://schema.org/","arrayShape":"cr:arrayShape","citeAs":"cr:citeAs","column":"cr:column","conformsTo":"dct:conformsTo","cr":"http://mlcommons.org/croissant/","data":{"@id":"cr:data","@type":"@json"},"dataBiases":"cr:dataBiases","dataCollection":"cr:dataCollection","dataType":{"@id":"cr:dataType","@type":"@vocab"},"dct":"http://purl.org/dc/terms/","extract":"cr:extract","field":"cr:field","fileProperty":"cr:fileProperty","fileObject":"cr:fileObject","fileSet":"cr:fileSet","format":"cr:format","includes":"cr:includes","isArray":"cr:isArray","isLiveDataset":"cr:isLiveDataset","jsonPath":"cr:jsonPath","key":"cr:key","md5":"cr:md5","parentField":"cr:parentField","path":"cr:path","personalSensitiveInformation":"cr:personalSensitiveInformation","recordSet":"cr:recordSet","references":"cr:references","regex":"cr:regex","repeated":"cr:repeated","replace":"cr:replace","sc":"https://schema.org/","separator":"cr:separator","source":"cr:source","subField":"cr:subField","transform":"cr:transform"},"@type":"sc:Dataset","distribution":[{"@type":"cr:FileObject","@id":"repo","name":"repo","description":"The Hugging Face git repository.","contentUrl":"ANONYMIZED","encodingFormat":"git+https","sha256":"https://github.com/mlcommons/croissant/issues/80"},{"@type":"cr:FileSet","@id":"parquet-files-for-config-default","containedIn":{"@id":"repo"},"encodingFormat":"application/x-parquet","includes":"default/*/*.parquet"}],"recordSet":[{"@type":"cr:RecordSet","dataType":"cr:Split","key":{"@id":"default_splits/split_name"},"@id":"default_splits","name":"default_splits","description":"Splits for the default config.","field":[{"@type":"cr:Field","@id":"default_splits/split_name","dataType":"sc:Text"}],"data":[{"default_splits/split_name":"train"}]},{"@type":"cr:RecordSet","@id":"default","description":"ANONYMIZED - 'default' subset (first 
5GB)","field":[{"@type":"cr:Field","@id":"default/split","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"fileProperty":"fullpath"},"transform":{"regex":"default/(?:partial-)?(train)/.+parquet$"}},"references":{"field":{"@id":"default_splits/split_name"}}},{"@type":"cr:Field","@id":"default/genome_name","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"genome_name"}}},{"@type":"cr:Field","@id":"default/contig_name","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"contig_name"}},"isArray":true,"arrayShape":"-1"},{"@type":"cr:Field","@id":"default/protein_id","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"protein_id"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/protein_sequence","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"protein_sequence"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/taxid","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"taxid"}}},{"@type":"cr:Field","@id":"default/locus_tag","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"locus_tag"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/start","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"start"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/end","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"end"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/product","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"product"}},"isAr
ray":true,"arrayShape":"-1,-1"}]}],"conformsTo":"http://mlcommons.org/croissant/1.1","name":"bacbench-antibiotic-resistance-protein-sequences","description":"\n\t\n\t\t\n\t\tDataset for antibiotic resistance prediction from whole-bacterial genomes (protein sequences)\n\t\n\nA dataset of 25,032 bacterial genomes across 39 species with antimicrobial resistance labels.\nThe genome protein sequences have been extracted from GenBank. Each row contains a whole bacterial genome, with spaces\nseparating different contigs present in the genome.\nThe antimicrobial resistance labels have been extracted from the Antibiotic Susceptibility Test (AST) Browser, accessed 23 Oct, 2024. … See the full description on the dataset page: ANONYMIZED.","alternateName":["ANONYMIZED"],"creator":{"@type":"Person","name":"ANONYMIZED","url":"ANONYMIZED"},"keywords":["10K - 100K","parquet","Text","Datasets","Dask","Croissant","Polars","🇺🇸 Region: US"],"url":"ANONYMIZED"}