% arXiv preprint. The arXiv identifier and primary class are stored in the
% eprint/eprinttype/eprintclass fields (not in a free-form note) so the
% bibliography style can format and link them. Only the camel-case coinage
% in the title is brace-protected; the style decides the casing of the rest.
@misc{karamiSynEHRgySynthesizingMixedType2024,
  author      = {Karami, Hojjat and Atienza, David and Ionescu, Anisoara},
  title       = {{SynEHRgy}: Synthesizing Mixed-Type Structured Electronic Health Records using Decoder-Only Transformers},
  shorttitle  = {{SynEHRgy}},
  year        = {2024},
  month       = nov,
  publisher   = {arXiv},
  eprint      = {2411.13428},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
  doi         = {10.48550/arXiv.2411.13428},
  url         = {http://arxiv.org/abs/2411.13428},
  urldate     = {2026-06-17},
  abstract    = {Generating synthetic Electronic Health Records (EHRs) offers significant potential for data augmentation, privacy-preserving data sharing, and improving machine learning model training. We propose a novel tokenization strategy tailored for structured EHR data, which encompasses diverse data types such as covariates, ICD codes, and irregularly sampled time series. Using a GPT-like decoder-only transformer model, we demonstrate the generation of high-quality synthetic EHRs. Our approach is evaluated using the MIMIC-III dataset, and we benchmark the fidelity, utility, and privacy of the generated data against state-of-the-art models.},
  keywords    = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, preprint},
}