@article{article_1674995,
  title     = {ChatGPT vs. DeepSeek: A comparative psychometric evaluation of AI tools in generating multiple-choice questions},
  author    = {Gündeğer Kılcı, Ceylan},
  journal   = {International Journal of Assessment Tools in Education},
  volume    = {12},
  number    = {4},
  pages     = {1055--1079},
  year      = {2025},
  doi       = {10.21449/ijate.1674995},
  publisher = {İzzet KARA},
  keywords  = {ChatGPT, DeepSeek, Item generation, Psychometrics, Generalizability theory},
  abstract  = {This study examined the psychometric quality of multiple-choice questions generated by two AI tools, ChatGPT and DeepSeek, within the context of an undergraduate Educational Measurement and Evaluation course. Guided by ten learning outcomes (LOs) aligned with Bloom's Taxonomy, each tool was prompted to generate one five-option multiple-choice item per LO. Following expert review (Kendall's W = .58), revisions were made, and the finalized test was administered to 120 students. Item analyses revealed no statistically significant differences between the two AI models regarding item difficulty, discrimination, variance, or reliability. A few items (two from ChatGPT and one from DeepSeek) had suboptimal discrimination indices. Tetrachoric correlation analyses of item pairs generated by the two AI tools for the same LO revealed that only one pair showed a non-significant association, whereas all other pairs demonstrated statistically significant and generally moderate correlations. KR-20 and split-half reliability coefficients reflected acceptable internal consistency for a classroom-based assessment, with the DeepSeek-generated half showing a slightly stronger correlation with total scores. Expert feedback indicated that while the AI tools generally produced valid stems and correct answers, most revisions focused on improving distractor quality, highlighting the need for human refinement. Generalizability and Decision studies confirmed consistency in expert ratings and recommended a minimum of seven experts for reliable evaluations. In conclusion, both AI tools demonstrated the capacity to generate psychometrically comparable items, underscoring their potential to support educators and test developers in test construction. The study closes with practical recommendations for effectively incorporating AI into test development workflows.}
}