% Journal article record; cite key: article_1730282.
% The page range uses the BibTeX double hyphen. Product names in the title are
% brace-protected so sentence-casing styles keep their capitalisation.
% The funding statement lives in the non-standard "funding" field, which is
% kept as an annotation only (unknown fields are ignored by bibliography styles).
@article{article_1730282,
  author    = {Çeker, Gökhan and Ulus, İsmail and Hacıbey, İbrahim},
  title     = {Benchmarking Artificial Intelligence Models for Clinical Guidance in Nocturia and Nocturnal Polyuria: A Comparative Evaluation of {ChatGPT}, {Gemini}, {Copilot}, and {Perplexity}},
  journal   = {The New Journal of Urology},
  year      = {2025},
  volume    = {20},
  number    = {3},
  pages     = {183--192},
  doi       = {10.33719/nju1730282},
  publisher = {Ali İhsan TAŞÇI},
  keywords  = {yapay zeka, büyük dil modelleri, noktüri, nokturnal poliüri},
  abstract  = {Objective: This study aimed to evaluate and compare the performance of four artificial intelligence (AI) models—ChatGPT-4.0, Gemini 1.5 Pro, Copilot, and Perplexity Pro—in answering clinical questions about nocturia and nocturnal polyuria.
Material and Methods: A total of 25 standardized clinical questions were developed across five thematic domains: general understanding, etiology and pathophysiology, diagnostic work-up, management strategies, and special populations. Responses from each AI model were scored by two blinded expert urologists using a five-point Likert scale across five quality domains: relevance, clarity, structure, utility, and factual accuracy. Mean scores were compared using repeated measures ANOVA or Friedman tests depending on data distribution. Inter-rater reliability was measured via the intraclass correlation coefficient (ICC).
Results: ChatGPT-4.0 and Perplexity Pro achieved the highest overall mean scores (4.61/5 and 4.52/5), significantly outperforming Gemini (4.35/5) and Copilot (3.63/5) (p = 0.032). ChatGPT scored highest in “general understanding” (4.86/5, p = 0.018), while Perplexity led in “management strategies” (4.74/5, p = 0.021). Copilot consistently scored lowest, particularly in “diagnostic work-up” (3.42/5, p = 0.008). In quality domain analysis, ChatGPT and Perplexity again outperformed others, especially in “factual accuracy” (4.48/5 and 4.44/5), with Copilot trailing (3.54/5, p = 0.001). Inter-rater reliability was excellent (ICC = 0.91).
Conclusion: ChatGPT and Perplexity Pro demonstrated strong performance in delivering clinically relevant and accurate information on nocturia and nocturnal polyuria. These findings suggest their potential as supportive tools for education and decision-making. Copilot’s lower performance underscores the need for continued model refinement. AI integration in clinical contexts should remain guided by expert validation and alignment with current urological guidelines.},
  funding   = {There was no institutional, commercial, or personal financial funding received for this research.},
}