{"@context":{"@language":"en","@vocab":"https://schema.org/","arrayShape":"cr:arrayShape","citeAs":"cr:citeAs","column":"cr:column","conformsTo":"dct:conformsTo","cr":"http://mlcommons.org/croissant/","data":{"@id":"cr:data","@type":"@json"},"dataBiases":"cr:dataBiases","dataCollection":"cr:dataCollection","dataType":{"@id":"cr:dataType","@type":"@vocab"},"dct":"http://purl.org/dc/terms/","extract":"cr:extract","field":"cr:field","fileProperty":"cr:fileProperty","fileObject":"cr:fileObject","fileSet":"cr:fileSet","format":"cr:format","includes":"cr:includes","isArray":"cr:isArray","isLiveDataset":"cr:isLiveDataset","jsonPath":"cr:jsonPath","key":"cr:key","md5":"cr:md5","parentField":"cr:parentField","path":"cr:path","personalSensitiveInformation":"cr:personalSensitiveInformation","recordSet":"cr:recordSet","references":"cr:references","regex":"cr:regex","repeated":"cr:repeated","replace":"cr:replace","sc":"https://schema.org/","separator":"cr:separator","source":"cr:source","subField":"cr:subField","transform":"cr:transform"},"@type":"sc:Dataset","distribution":[{"@type":"cr:FileObject","@id":"repo","name":"repo","description":"The Hugging Face git repository.","contentUrl":"ANONYMIZED","encodingFormat":"git+https","sha256":"https://github.com/mlcommons/croissant/issues/80"},{"@type":"cr:FileSet","@id":"parquet-files-for-config-default","containedIn":{"@id":"repo"},"encodingFormat":"application/x-parquet","includes":"default/*/*.parquet"}],"recordSet":[{"@type":"cr:RecordSet","dataType":"cr:Split","key":{"@id":"default_splits/split_name"},"@id":"default_splits","name":"default_splits","description":"Splits for the default config.","field":[{"@type":"cr:Field","@id":"default_splits/split_name","dataType":"sc:Text"}],"data":[{"default_splits/split_name":"test"}]},{"@type":"cr:RecordSet","@id":"default","description":"ANONYMIZED - 'default' 
subset","field":[{"@type":"cr:Field","@id":"default/split","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"fileProperty":"fullpath"},"transform":{"regex":"default/(?:partial-)?(test)/.+parquet$"}},"references":{"field":{"@id":"default_splits/split_name"}}},{"@type":"cr:Field","@id":"default/strain_name","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"strain_name"}}},{"@type":"cr:Field","@id":"default/contig_name","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"contig_name"}},"isArray":true,"arrayShape":"-1"},{"@type":"cr:Field","@id":"default/gene_name","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"gene_name"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/locus_tag","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"locus_tag"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/start","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"start"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/end","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"end"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/strand","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"strand"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/protein_id","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"protein_id"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/protein_sequence","dataType":"sc:Text","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"e
xtract":{"column":"protein_sequence"}},"isArray":true,"arrayShape":"-1,-1"},{"@type":"cr:Field","@id":"default/operon_prot_indices","dataType":"cr:Int64","source":{"fileSet":{"@id":"parquet-files-for-config-default"},"extract":{"column":"operon_prot_indices"}},"isArray":true,"arrayShape":"-1,-1,-1"}]}],"conformsTo":"http://mlcommons.org/croissant/1.1","name":"operon-identification-long-read-rna-sequencing-protein-sequences","description":"Dataset for operon identification from long-read RNA sequencing\n\nA dataset of annotated operons across 5 distinct bacterial strains. The operons were annotated by running and analysing long-read RNA sequencing and identifying genes located on the same transcripts.\nThe genome protein sequences have been extracted from GenBank. Each row contains a whole bacterial genome represented by an ordered list of protein sequences.\n\nUsage\n\nFor a complete example of how to read and use… See the full description on the dataset page: ANONYMIZED.","alternateName":["ANONYMIZED","Operon identification dataset from long-read RNA sequencing"],"creator":{"@type":"Person","name":"ANONYMIZED","url":"ANONYMIZED"},"keywords":["apache-2.0","< 1K","parquet","Text","Datasets","pandas","Croissant","Polars","🇺🇸 Region: US","operon","bacteria","bacformer","prokaryotes","genomics"],"license":"https://choosealicense.com/licenses/apache-2.0/","url":"ANONYMIZED"}