@article{article_1718121,
  title={Performance of Generative AI Models on Cardiology Practice in Emergency Service: A Pilot Evaluation of GPT-4.o and Gemini-1.5-Flash},
  journal={Journal of Uludağ University Medical Faculty},
  volume={51},
  number={2},
  pages={239–246},
  year={2025},
  DOI={10.32708/uutfd.1718121},
  publisher={Bursa Uludağ Üniversitesi},
  author={Günay Polatkan, Şeyda and Sığırlı, Deniz and Durak, Vahide Aslıhan and Alak, Çetin and Kan, Irem Iris},
  keywords={cardiology, decision-making, artificial intelligence, GPT-4.o, Gemini-1.5-Flash},
  abstract={In healthcare, emergent clinical decision-making is complex, and large language models (LLMs) may enhance both the quality and efficiency of care by aiding physicians. Case scenario-based multiple-choice questions (CS-MCQs) are valuable for testing analytical skills and knowledge integration. Moreover, readability is as important as content accuracy. This study aims to compare the diagnostic and treatment capabilities of GPT-4.o and Gemini-1.5-Flash and to evaluate the readability of their responses for cardiac emergencies. A total of 70 single-answer MCQs were randomly selected from the Medscape Case Challenges and ECG Challenges series. The questions concerned cardiac emergencies and were categorized into four subgroups according to whether or not they included a case presentation or an image. The ChatGPT and Gemini platforms were used to assess the selected questions. The Flesch–Kincaid Grade Level (FKGL) and Flesch Reading Ease (FRE) scores were used to evaluate the readability of the responses. GPT-4.o had a correct response rate of 65.7%, outperforming Gemini-1.5-Flash, which had a 58.6% correct response rate (p=0.010). When comparing by question type, GPT-4.o was inferior to Gemini-1.5-Flash only for non-case questions (52.5% vs. 62.5%, p=0.011). For all other question types, there were no significant performance differences between the two models (p>0.05). Both models performed better on easy questions than on difficult ones, and on questions without images than on those with images. Additionally, GPT-4.o performed better on case questions than on non-case questions. Gemini-1.5-Flash’s FRE score was higher than GPT-4.o’s (median [min–max], 23.75 [0–64.60] vs. 17.0 [0–56.60], p<0.001). Although GPT-4.o outperformed Gemini-1.5-Flash overall, both models demonstrated an ability to comprehend the case scenarios and provided reasonable answers.}
}