@article{article_1674529,
  author    = {Tanyeri, Ahmet and Akbulut, Rıdvan and Katmerlikaya, Aygün and Varol, Aral and Tuzcu, Göksel and Şahin, Tuna},
  title     = {Evaluating the Performance of Artificial Intelligence in MRI-Based LI-RADS v2018 Classification of Liver Lesions},
  journal   = {Eskisehir Medical Journal},
  volume    = {6},
  number    = {2},
  pages     = {159--163},
  year      = {2025},
  publisher = {Eskişehir Şehir Hastanesi},
  keywords  = {Artificial intelligence, magnetic resonance imaging, hepatocellular carcinoma, liver imaging reporting and data system, LI-RADS 2018, ChatGPT},
  abstract  = {Introduction: This study aimed to evaluate the diagnostic performance of GPT-4V, a vision-enabled large language model, in classifying liver lesions on MRI according to LI-RADS v2018 criteria. Methods: Seventy contrast-enhanced liver MRI examinations were retrospectively selected, comprising 10 cases from each LI-RADS category. Each case was presented to GPT-4V as a standardised set of seven anonymised axial MRI slices, accompanied only by lesion size. The model was prompted to assign a single LI-RADS category based solely on visual input. The model's performance was assessed using overall accuracy, Cohen's kappa, ROC analysis, and correlation with lesion size. Results: GPT-4V achieved an overall classification accuracy of 37.1%. While the accuracy for LR-5 was high (90%), it was notably poor in LR-3 (0%) and LR-4 (20%). More than half of the lesions were misclassified as LR-5 (54.2%). Binary classification into benign (LR-1 and LR-2) versus malignant (LR-4, LR-5, LR-M, LR-TIV) yielded an accuracy of 84.3%, with an AUC of 0.72, sensitivity of 100%, and specificity of 45%. Cohen's kappa values were 0.27 for detailed classification and 0.54 for benign–malignant grouping. Lesion size positively correlated with classification accuracy (ρ = 0.26, p = 0.031). The model demonstrated a tendency to favour high-certainty categories, often defaulting to LR-5 when diagnostic ambiguity was present. Conclusions: GPT-4V demonstrated limited performance in detailed LI-RADS classification, with a strong bias toward LR-5 and poor accuracy in intermediate categories. While the model performed relatively better in distinguishing benign from malignant lesions, in its current form it remains inadequate for precise image-based categorisation. Further development with structured visual training and clinically contextual prompting is warranted.}
}