@article {2754, title = {How Do Trait Change Patterns Affect the Performance of Adaptive Measurement of Change?}, journal = {Journal of Computerized Adaptive Testing}, volume = {10}, year = {2023}, pages = {32-58}, keywords = {adaptive measurement of change, computerized adaptive testing, longitudinal measurement, trait change patterns}, doi = {10.7333/2307-1003032}, author = {Ming Him Tai and Allison W. Cooperman and Joseph N. DeWeese and David J. Weiss} } @conference {2667, title = {Adapting Linear Models for Optimal Test Design to More Complex Test Specifications}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Combinatorial optimization (CO) has proven to be a very helpful approach for addressing test assembly issues and for providing solutions. CO has been applied to several test designs, including (1) the development of linear test forms, (2) computerized adaptive testing, and (3) multistage testing. In his seminal work, van der Linden (2006) laid out the basis for using linear models to simultaneously assemble exams and item pools under a variety of conditions: for single tests and multiple tests, with item sets, etc. However, for some testing programs, the number and complexity of test specifications can grow rapidly. Consequently, the mathematical representation of the test assembly problem goes beyond most approaches reported either in van der Linden's book or in the majority of other publications related to test assembly. In this presentation, we extend van der Linden's framework by introducing the concept of blocks for test specifications. We modify the usual mathematical notation of a test assembly problem to include this concept and show how it can be applied to various test designs. Finally, we demonstrate an implementation of this approach in a stand-alone software application called ATASolver.


}, keywords = {Complex Test Specifications, Linear Models, Optimal Test Design}, author = {Maxim Morin} } @conference {2641, title = {Is CAT Suitable for Automated Speaking Test?}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

We have developed an automated scoring system for Japanese speaking proficiency, SJ-CAT (Speaking Japanese Computerized Adaptive Test), which has been operational for the past few months. One of the unique features of the test is that it is adaptive, based on polytomous IRT.

SJ-CAT consists of two sections: Section 1 contains sentence reading-aloud tasks and multiple-choice reading tasks, and Section 2 contains sentence generation tasks and open-answer tasks. In a reading-aloud task, a test taker reads a phoneme-balanced sentence on the screen after listening to a model reading. In a multiple-choice reading task, a test taker sees a picture and reads aloud the one sentence, among three on the screen, that describes the scene most appropriately. In a sentence generation task, a test taker sees a picture or watches a video clip and describes the scene in his/her own words for about ten seconds. In an open-answer task, the test taker expresses support for or opposition to a topic (e.g., nuclear power generation), with reasons, for about 30 seconds.

In the course of developing the test, we found many unexpected and unique characteristics of speaking CAT that are not found in the usual multiple-choice CATs. In this presentation, we discuss some of these factors, which we had not noticed in our previous project developing the dichotomous J-CAT (Japanese Computerized Adaptive Test), which consists of vocabulary, grammar, reading, and listening sections. First, we claim that the distribution of item difficulty parameters depends on the types of items: with an item pool of unrestricted item types, such as open questions, it is difficult to achieve an ideal distribution, whether normal or uniform. Second, contrary to our expectations, open questions are not necessarily more difficult to handle in an automated scoring system than more restricted tasks such as sentence reading, as long as one can set up a suitable scoring algorithm for open questions. Third, we show that the standard deviation of the posterior distribution, i.e., the standard error of the theta parameter, converges faster under the polytomous IRT model used in SJ-CAT than under the dichotomous IRT model used in J-CAT. Fourth, we discuss problems in equating items in SJ-CAT and suggest introducing deep learning with reinforcement learning in place of equating. Finally, we discuss issues in operating SJ-CAT on the web, including scoring speed, operation costs, and security, among others.


}, keywords = {Automated Speaking Test, CAT, language testing}, author = {Shingo Imai} } @conference {2643, title = {A Large-Scale Progress Monitoring Application with Computerized Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Many conventional assessment tools are available to teachers in schools for monitoring student progress in a formative manner. The outcomes of these assessment tools are essential to teachers' instructional modifications and schools' data-driven educational strategies, such as using remedial activities and planning instructional interventions for students with learning difficulties. When measuring student progress toward instructional goals or outcomes, assessments should be not only highly precise but also sensitive to individual change in learning. Unlike conventional paper-and-pencil assessments, which are usually not appropriate for every student, computerized adaptive tests (CATs) are highly capable of estimating growth with minimal and consistent error. Therefore, CATs can be used as a progress monitoring tool for measuring student growth.

This study focuses on an operational CAT assessment that has been used for measuring student growth in reading during the academic school year. The sample of this study consists of nearly 7 million students from 1st grade through 12th grade in the US. The students received a CAT-based reading assessment periodically during the school year. The purpose of these periodic assessments is to measure growth in students' reading achievement and to identify students who may need additional instructional support (e.g., academic interventions). Using real data, this study aims to address the following research questions: (1) How many CAT administrations are necessary to make psychometrically sound decisions about the need for instructional changes in the classroom or about when to provide academic interventions? (2) What is the ideal amount of time between CAT administrations to capture student growth for the purpose of producing meaningful decisions from assessment results?

To address these research questions, we first used the Theil-Sen estimator to robustly fit a regression line to each student's test scores obtained from a series of CAT administrations. Next, we used the conditional standard error of measurement (cSEM) from the CAT administrations to create an error band around the Theil-Sen slope (i.e., the student growth rate). This process resulted in normative slope values across all grade levels. The optimal number of CAT administrations was established from grade-level regression results. The amount of time needed for progress monitoring was determined by calculating the time required for a student to show growth beyond the median cSEM value for each grade level. The results showed that the normative slope values were highest in the lower grades and declined steadily as grade level increased. The results also suggested that the CAT-based reading assessment is most useful for grades 1 through 4, since most struggling readers requiring an intervention appear to be within this grade range. Because CAT yielded very similar cSEM values across administrations, the amount of error in the progress monitoring decisions did not appear to depend on the number of CAT administrations.


}, keywords = {CAT, Large-Scale tests, Progress monitoring}, url = {https://drive.google.com/open?id=1uGbCKenRLnqTxImX1fZicR2c7GRV6Udc}, author = {Okan Bulut and Damien Cormier} } @article {2529, title = {Latent-Class-Based Item Selection for Computerized Adaptive Progress Tests}, journal = {Journal of Computerized Adaptive Testing}, volume = {5}, year = {2017}, pages = {22-43}, keywords = {computerized adaptive progress test, item selection method, Kullback-Leibler information, Latent class analysis, log-odds scoring}, issn = {2165-6592}, doi = {10.7333/1704-0502022}, url = {http://iacat.org/jcat/index.php/jcat/article/view/62/29}, author = {van Buuren, Nikky and Eggen, Theo J. H. M.} } @conference {2660, title = {MHK-MST Design and the Related Simulation Study}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The MHK is a national standardized exam that tests and rates Chinese language proficiency. It assesses non-native Chinese minorities' abilities to use the Chinese language in their daily, academic, and professional lives. Computerized multistage adaptive testing (MST) is a hybrid of the conventional paper-and-pencil (P\&P) test and the item-level computerized adaptive test (CAT): a computer-based test design that takes the item set (module) as the unit of administration and scoring. MST estimates extreme ability values more accurately than a conventional P\&P test, and it uses the adaptive character of CAT to reduce test length and score-reporting time. At present, MST is used in some large-scale tests, such as the Uniform CPA Examination and the Graduate Record Examination (GRE). It is therefore worthwhile to develop MST applications in China.

Based on consideration of the MHK's characteristics and its future development, the researchers began by designing the MHK-MST. This simulation study was conducted to validate the performance of the MHK-MST system. Real difficulty parameters of MHK items and simulated ability parameters of the candidates were used to generate the original score matrix, and the item modules were delivered to the candidates following the adaptive procedures set according to the path rules. This simulation study provides a sound basis for the implementation of MHK-MST.


}, keywords = {language testing, MHK, multistage testing}, author = {Ling Yuyu and Zhou Chenglin and Ren Jie} } @conference {2628, title = {Using Automated Item Generation in a Large-scale Medical Licensure Exam Program: Lessons Learned.}, booktitle = {2017 IACAT Conference}, year = {2017}, month = {08.2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

On-demand testing has become commonplace with most large-scale testing programs. Continuous testing is appealing for candidates in that it affords greater flexibility in scheduling a session at the desired location. Furthermore, the push for more comprehensive systems of assessment (e.g., CBAL) is predicated on the availability of more frequently administered tasks, given the purposeful link between instruction and assessment in these frameworks. However, continuous testing models impose several challenges for programs, including overexposure of items. Robust item banks are therefore needed to support routine retirement and replenishment of items. In a traditional approach to developing items, content experts select a topic and then develop an item consisting of a stem, a lead-in question, a correct answer, and a list of distractors. The item then undergoes review by a panel of experts to validate the content and identify any potential flaws. The process involved in developing quality MCQ items can be time-consuming as well as costly, with estimates as high as \$1500-\$2500 USD per item (Rudner, 2010). The Medical Council of Canada (MCC) has been exploring a novel item development process to supplement traditional approaches. Specifically, the use of automated item generation (AIG), which uses technology to generate test items from cognitive models, has been studied for over five years. Cognitive models are representations of the knowledge and skills that are required to solve any given problem. While developing a cognitive model for a medical scenario, for example, content experts are asked to deconstruct the (clinical) reasoning process involved via clearly stated variables and related elements. The latter information is then entered into a computer program that uses algorithms to generate MCQs. The MCC has been piloting AIG-based items for over five years with the MCC Qualifying Examination Part I (MCCQE I), a prerequisite for licensure in Canada. The aim of this presentation is to provide an overview of the practical lessons learned in the use and operational rollout of AIG with the MCCQE I. Psychometrically, the quality of the items is at least equal, and in many instances superior, to that of traditionally written MCQs, based on difficulty, discrimination, and information. In fact, 96\% of the AIG-based items piloted in a recent administration were retained for future operational scoring based on pre-defined inclusion criteria. AIG also offers a framework for the systematic creation of plausible distractors, in that the content experts not only need to provide the clinical reasoning underlying a correct response but also the cognitive errors associated with each of the distractors (Lai et al., 2016). Consequently, AIG holds great promise for improving and tailoring diagnostic feedback for remedial purposes (Pugh, De Champlain, Gierl, Lai, \& Touchie, 2016). Furthermore, our test development process has been greatly enhanced by the addition of AIG, because it requires that item writers use metacognitive skills to describe how they solve problems. We are hopeful that sharing our experiences with attendees might not only help other testing organizations interested in adopting AIG, but also foster discussion that might benefit all participants.

References

Lai, H., Gierl, M.J., Touchie, C., Pugh, D., Boulais, A.P., \& De Champlain, A.F. (2016). Using automatic item generation to improve the quality of MCQ distractors. Teaching and Learning in Medicine, 28, 166-173.

Pugh, D., De Champlain, A.F., Lai, H., Gierl, M., \& Touchie, C. (2016). Using cognitive models to develop quality multiple choice questions. Medical Teacher, 38, 838-843.

Rudner, L. (2010). Implementing the Graduate Management Admission Test Computerized Adaptive Test. In W. van der Linden \& C. Glas (Eds.), Elements of adaptive testing (pp. 151-165). New York, NY: Springer.


}, keywords = {Automated item generation, large scale, medical licensure}, url = {https://drive.google.com/open?id=14N8hUc8qexAy5W_94TykEDABGVIJHG1h}, author = {Andr{\'e} F. De Champlain} } @conference {2099, title = {From Reliability to Validity: Expanding Adaptive Testing Practice to Find the Most Valid Score for Each Test Taker}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

CAT is an exception to the traditional conception of validity: it is one of the few examples of individualized testing, in which item difficulty is tailored to each examinee. The intent, however, is increased efficiency. The focus is on reliability (reduced standard error), equivalence with paper \& pencil tests is valued, and validity is enhanced through improved reliability.

How Else Might We Individualize Testing Using CAT?

An ISV-Based View of Validity

Test Event -- An examinee encounters a series of items in a particular context.

CAT Goal: individualize testing to address CIV threats to score validity (i.e., maximize ISV).

Some Research Issues:

}, keywords = {CAT, CIV, construct-irrelevant variance, Individual Score Validity, ISV, low test taking motivation, Reliability, validity}, author = {Steven L. Wise} } @conference {2081, title = {Practitioner{\textquoteright}s Approach to Identify Item Drift in CAT}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, keywords = {CUSUM method, G2 statistic, IPA, item drift, item parameter drift, Lord{\textquoteright}s chi-square statistic, Raju{\textquoteright}s NCDIF}, author = {Huijuan Meng and Susan Steinkamp and Paul Jones and Joy Matthews-Lopez} } @article {2, title = {Deterioro de par{\'a}metros de los {\'\i}tems en tests adaptativos informatizados: estudio con eCAT [Item parameter drift in computerized adaptive testing: Study with eCAT]}, journal = {Psicothema}, volume = {22}, number = {2}, year = {2010}, note = {Abad, Francisco JOlea, JulioAguado, DavidPonsoda, VicenteBarrada, Juan REnglish AbstractSpainPsicothemaPsicothema. 2010 May;22(2):340-7.}, pages = {340-7}, edition = {2010/04/29}, abstract = {

Item parameter drift in computerized adaptive testing: Study with eCAT. This study describes the parameter drift analysis conducted on eCAT (a computerized adaptive test to assess the written English level of Spanish speakers). The original calibration of the item bank (N = 3224) was compared to a new calibration obtained from the data provided by most eCAT operative administrations (N = 7254). A Differential Item Functioning (DIF) study was conducted between the original and the new calibrations. The impact that the new parameters have on the trait level estimates was obtained by simulation. Results show that parameter drift is found especially for the a and c parameters, that an important number of bank items show DIF, and that the parameter change has a moderate impact on the theta estimates of examinees with a high level of English. It is therefore recommended that the original estimates be replaced by the new set.

}, keywords = {*Software, Educational Measurement/*methods/*statistics \& numerical data, Humans, Language}, isbn = {0214-9915 (Print)0214-9915 (Linking)}, author = {Abad, F. J. and Olea, J. and Aguado, D. and Ponsoda, V. and Barrada, J} } @article {2071, title = {Item Selection and Hypothesis Testing for the Adaptive Measurement of Change}, journal = {Applied Psychological Measurement}, volume = {34}, year = {2010}, pages = {238-254}, abstract = {

Assessing individual change is an important topic in both psychological and educational measurement. An adaptive measurement of change (AMC) method had previously been shown to exhibit greater efficiency in detecting change than conventional nonadaptive methods. However, little work had been done to compare different procedures within the AMC framework. This study introduced a new item selection criterion and two new test statistics for detecting change with AMC that were specifically designed for the paradigm of hypothesis testing. In two simulation sets, the new methods for detecting significant change improved on existing procedures by demonstrating better adherence to Type I error rates and substantially better power for detecting relatively small change.

}, keywords = {change, computerized adaptive testing, individual change, Kullback{\textendash}Leibler information, likelihood ratio, measuring change}, doi = {10.1177/0146621609344844}, author = {Finkelman, M. D. and Weiss, D. J. and Kim-Kang, G.} } @article {112, title = {A mixed integer programming model for multiple stage adaptive testing}, journal = {European Journal of Operational Research}, volume = {193}, number = {2}, year = {2009}, note = {doi: DOI: 10.1016/j.ejor.2007.10.047}, pages = {342-350}, abstract = {The last decade has seen paper-and-pencil (P\&P) tests being replaced by computerized adaptive tests (CATs) within many testing programs. A CAT may yield several advantages relative to a conventional P\&P test. A CAT can determine the questions or test items to administer, allowing each test form to be tailored to a test taker{\textquoteright}s skill level. Subsequent items can be chosen to match the capability of the test taker. By adapting to a test taker{\textquoteright}s ability, a CAT can acquire more information about a test taker while administering fewer items. A Multiple Stage Adaptive test (MST) provides a means to implement a CAT that allows review before the administration. The MST format is a hybrid between the conventional P\&P and CAT formats. This paper presents mixed integer programming models for MST assembly problems. Computational results with commercial optimization software will be given and advantages of the models evaluated.}, keywords = {Education, Integer programming, Linear programming}, isbn = {0377-2217}, author = {Edmonds, J. and Armstrong, R. D.} } @article {169, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: II. Participation outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {2}, year = {2008}, note = {Haley, Stephen MGandek, BarbaraSiebens, HilaryBlack-Schaffer, Randie MSinclair, Samuel JTao, WeiCoster, Wendy JNi, PengshengJette, Alan MK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesK02 HD45354-01/HD/NICHD NIH HHS/United StatesR01 HD043568/HD/NICHD NIH HHS/United StatesR01 HD043568-01/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Feb;89(2):275-83.}, month = {Feb}, pages = {275-283}, edition = {2008/01/30}, abstract = {OBJECTIVES: To measure participation outcomes with a computerized adaptive test (CAT) and compare CAT and traditional fixed-length surveys in terms of score agreement, respondent burden, discriminant validity, and responsiveness. DESIGN: Longitudinal, prospective cohort study of patients interviewed approximately 2 weeks after discharge from inpatient rehabilitation and 3 months later. SETTING: Follow-up interviews conducted in patient{\textquoteright}s home setting. PARTICIPANTS: Adults (N=94) with diagnoses of neurologic, orthopedic, or medically complex conditions. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Participation domains of mobility, domestic life, and community, social, \& civic life, measured using a CAT version of the Participation Measure for Postacute Care (PM-PAC-CAT) and a 53-item fixed-length survey (PM-PAC-53). RESULTS: The PM-PAC-CAT showed substantial agreement with PM-PAC-53 scores (intraclass correlation coefficient, model 3,1, .71-.81). On average, the PM-PAC-CAT was completed in 42\% of the time and with only 48\% of the items as compared with the PM-PAC-53. 
Both formats discriminated across functional severity groups. The PM-PAC-CAT had modest reductions in sensitivity and responsiveness to patient-reported change over a 3-month interval as compared with the PM-PAC-53. CONCLUSIONS: Although continued evaluation is warranted, accurate estimates of participation status and responsiveness to change for group-level analyses can be obtained from CAT administrations, with a sizeable reduction in respondent burden.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Haley, S. M. and Gandek, B. and Siebens, H. and Black-Schaffer, R. M. and Sinclair, S. J. and Tao, W. and Coster, W. J. and Ni, P. and Jette, A. M.} } @article {2103, title = {Computerized Adaptive Testing of Personality Traits}, journal = {Zeitschrift f{\"u}r Psychologie / Journal of Psychology}, volume = {216}, year = {2008}, pages = {12-21}, abstract = {

A computerized adaptive testing (CAT) procedure was simulated with ordinal polytomous personality data collected using a conventional paper-and-pencil testing format. An adapted Dutch version of the dominance scale of Gough and Heilbrun's Adjective Check List (ACL) was used. This version contained Likert response scales with five categories. Item parameters were estimated using Samejima's graded response model from the responses of 1,925 subjects. The CAT procedure was simulated using the responses of 1,517 other subjects. The value of the required standard error in the stopping rule of the CAT was manipulated. The relationship between CAT latent trait estimates and estimates based on all dominance items was studied. Additionally, the pattern of relationships between the CAT latent trait estimates and the other ACL scales was compared to that between latent trait estimates based on the entire item pool and the other ACL scales. The CAT procedure resulted in latent trait estimates qualitatively equivalent to latent trait estimates based on all items, while a substantial reduction in the number of items used could be realized (with a stopping rule of 0.4, about 33\% of the 36 items were used).

}, keywords = {Adaptive Testing, computer-assisted testing, Item Response Theory, Likert scales, Personality Measures}, doi = {10.1027/0044-3409.216.1.12}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @article {293, title = {The NAPLEX: evolution, purpose, scope, and educational implications}, journal = {American Journal of Pharmaceutical Education}, volume = {72}, number = {2}, year = {2008}, note = {Newton, David WBoyle, MariaCatizone, Carmen AHistorical ArticleUnited StatesAmerican journal of pharmaceutical educationAm J Pharm Educ. 2008 Apr 15;72(2):33.}, month = {Apr 15}, pages = {33}, edition = {2008/05/17}, abstract = {Since 2004, passing the North American Pharmacist Licensure Examination (NAPLEX) has been a requirement for earning initial pharmacy licensure in all 50 United States. The creation and evolution from 1952-2005 of the particular pharmacy competency testing areas and quantities of questions are described for the former paper-and-pencil National Association of Boards of Pharmacy Licensure Examination (NABPLEX) and the current candidate-specific computer adaptive NAPLEX pharmacy licensure examinations. A 40\% increase in the weighting of NAPLEX Blueprint Area 2 in May 2005, compared to that in the preceding 1997-2005 Blueprint, has implications for candidates{\textquoteright} NAPLEX performance and associated curricular content and instruction. New pharmacy graduates{\textquoteright} scores on the NAPLEX are neither intended nor validated to serve as a criterion for assessing or judging the quality or effectiveness of pharmacy curricula and instruction. The newest cycle of NAPLEX Blueprint revision, a continual process to ensure representation of nationwide contemporary practice, began in early 2008. It may take up to 2 years, including surveying several thousand national pharmacists, to complete.}, keywords = {*Educational Measurement, Education, Pharmacy/*standards, History, 20th Century, History, 21st Century, Humans, Licensure, Pharmacy/history/*legislation \& jurisprudence, North America, Pharmacists/*legislation \& jurisprudence, Software}, isbn = {1553-6467 (Electronic)0002-9459 (Linking)}, author = {Newton, D. W. and Boyle, M. and Catizone, C. A.} } @article {401, title = {Assembling a computerized adaptive testing item pool as a set of linear tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {31}, number = {1}, year = {2006}, pages = {81-99}, publisher = {Sage Publications: US}, abstract = {Test-item writing efforts typically result in item pools with an undesirable correlational structure between the content attributes of the items and their statistical information. If such pools are used in computerized adaptive testing (CAT), the algorithm may be forced to select items that have less than optimal information, violate the content constraints, and/or have unfavorable exposure rates. Although at first sight somewhat counterintuitive, it is shown that if the CAT pool is assembled as a set of linear test forms, undesirable correlations can be broken down effectively. It is proposed to assemble such pools using a mixed integer programming model with constraints that guarantee that each test meets all content specifications and an objective function that requires them to have maximal information at a well-chosen set of ability values.
An empirical example with a previous master pool from the Law School Admission Test (LSAT) yielded a CAT with nearly uniform bias and mean-squared error functions for the ability estimator and item-exposure rates that satisfied the target for all items in the pool. }, keywords = {Algorithms, computerized adaptive testing, item pool, linear tests, mathematical models, statistics, Test Construction, Test Items}, isbn = {1076-9986 (Print)}, author = {van der Linden, W. J. and Ariel, A. and Veldkamp, B. P.} } @article {247, title = {Comparing methods of assessing differential item functioning in a computerized adaptive testing environment}, journal = {Journal of Educational Measurement}, volume = {43}, number = {3}, year = {2006}, pages = {245-264}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Mantel-Haenszel and SIBTEST, which have known difficulty in detecting non-unidirectional differential item functioning (DIF), have been adapted with some success for computerized adaptive testing (CAT). This study adapts logistic regression (LR) and the item-response-theory-likelihood-ratio test (IRT-LRT), capable of detecting both unidirectional and non-unidirectional DIF, to the CAT environment in which pretest items are assumed to be seeded in CATs but not used for trait estimation. The proposed adaptation methods were evaluated with simulated data under different sample size ratios and impact conditions in terms of Type I error, power, and specificity in identifying the form of DIF. The adapted LR and IRT-LRT procedures are more powerful than the CAT version of SIBTEST for non-unidirectional DIF detection. The good Type I error control provided by IRT-LRT under extremely unequal sample sizes and large impact is encouraging. Implications of these and other findings are discussed. all rights reserved)}, keywords = {computerized adaptive testing, educational testing, item response theory likelihood ratio test, logistic regression, trait estimation, unidirectional \& non-unidirectional differential item functioning}, isbn = {0022-0655 (Print)}, author = {Lei, P-W. and Chen, S-Y. and Yu, L.} } @article {176, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: I. Activity outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {8}, year = {2006}, note = {Haley, Stephen MSiebens, HilaryCoster, Wendy JTao, WeiBlack-Schaffer, Randie MGandek, BarbaraSinclair, Samuel JNi, PengshengK0245354-01/phsR01 hd043568/hd/nichdResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Aug;87(8):1033-42.}, month = {Aug}, pages = {1033-42}, edition = {2006/08/01}, abstract = {OBJECTIVE: To examine score agreement, precision, validity, efficiency, and responsiveness of a computerized adaptive testing (CAT) version of the Activity Measure for Post-Acute Care (AM-PAC-CAT) in a prospective, 3-month follow-up sample of inpatient rehabilitation patients recently discharged home. DESIGN: Longitudinal, prospective 1-group cohort study of patients followed approximately 2 weeks after hospital discharge and then 3 months after the initial home visit. SETTING: Follow-up visits conducted in patients{\textquoteright} home setting. PARTICIPANTS: Ninety-four adults who were recently discharged from inpatient rehabilitation, with diagnoses of neurologic, orthopedic, and medically complex conditions. INTERVENTIONS: Not applicable. 
MAIN OUTCOME MEASURES: Summary scores from AM-PAC-CAT, including 3 activity domains of movement and physical, personal care and instrumental, and applied cognition were compared with scores from a traditional fixed-length version of the AM-PAC with 66 items (AM-PAC-66). RESULTS: AM-PAC-CAT scores were in good agreement (intraclass correlation coefficient model 3,1 range, .77-.86) with scores from the AM-PAC-66. On average, the CAT programs required 43\% of the time and 33\% of the items compared with the AM-PAC-66. Both formats discriminated across functional severity groups. The standardized response mean (SRM) was greater for the movement and physical fixed form than the CAT; the effect size and SRM of the 2 other AM-PAC domains showed similar sensitivity between CAT and fixed formats. Using patients{\textquoteright} own report as an anchor-based measure of change, the CAT and fixed length formats were comparable in responsiveness to patient-reported change over a 3-month interval. CONCLUSIONS: Accurate estimates for functional activity group-level changes can be obtained from CAT administrations, with a considerable reduction in administration time.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Siebens, H. and Coster, W. J. and Tao, W. and Black-Schaffer, R. M. and Gandek, B. and Sinclair, S. J. and Ni, P.} } @article {352, title = {Computerized adaptive testing of diabetes impact: a feasibility study of Hispanics and non-Hispanics in an active clinic population}, journal = {Quality of Life Research}, volume = {15}, number = {9}, year = {2006}, note = {Schwartz, CarolynWelch, GarrySantiago-Kelley, PaulaBode, RitaSun, Xiaowu1 r43 dk066874-01/dk/niddkResearch Support, N.I.H., ExtramuralNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2006 Nov;15(9):1503-18. Epub 2006 Sep 26.}, month = {Nov}, pages = {1503-18}, edition = {2006/10/13}, abstract = {BACKGROUND: Diabetes is a leading cause of death and disability in the US and is twice as common among Hispanic Americans as non-Hispanics. The societal costs of diabetes provide an impetus for developing tools that can improve patient care and delay or prevent diabetes complications. METHODS: We implemented a feasibility study of a Computerized Adaptive Test (CAT) to measure diabetes impact using a sample of 103 English- and 97 Spanish-speaking patients (mean age = 56.5, 66.5\% female) in a community medical center with a high proportion of minority patients (28\% African-American). The 37 items of the Diabetes Impact Survey were translated using forward-backward translation and cognitive debriefing. Participants were randomized to receive either the full-length tool or the Diabetes-CAT first, in the patient{\textquoteright}s native language. RESULTS: The number of items and the amount of time to complete the survey for the CAT was reduced to one-sixth the amount for the full-length tool in both languages, across disease severity. Confirmatory Factor Analysis confirmed that the Diabetes Impact Survey is unidimensional. 
The Diabetes-CAT demonstrated acceptable internal consistency reliability, construct validity, and discriminant validity in the overall sample, although subgroup analyses suggested that the English sample data evidenced higher levels of reliability and validity than the Spanish sample and issues with discriminant validity in the Spanish sample. Differential Item Functioning analysis revealed differences in response tendencies by language group in 3 of the 37 items. Participant interviews suggested that the Spanish-speaking patients generally preferred the paper survey to the computer-assisted tool, and were twice as likely to experience difficulties understanding the items. CONCLUSIONS: While the Diabetes-CAT demonstrated clear advantages in reducing respondent burden as compared to the full-length tool, simplifying the item bank will be necessary for enhancing the feasibility of the Diabetes-CAT for use with low-literacy patients.}, keywords = {*Computers, *Hispanic Americans, *Quality of Life, Adult, Aged, Data Collection/*methods, Diabetes Mellitus/*psychology, Feasibility Studies, Female, Humans, Language, Male, Middle Aged}, isbn = {0962-9343 (Print)}, author = {Schwartz, C. and Welch, G. and Santiago-Kelley, P. and Bode, R. and Sun, X.} } @article {399, title = {Equating scores from adaptive to linear tests}, journal = {Applied Psychological Measurement}, volume = {30}, number = {6}, year = {2006}, pages = {493-508}, publisher = {Sage Publications: US}, abstract = {Two local methods for observed-score equating are applied to the problem of equating an adaptive test to a linear test. In an empirical study, the methods were evaluated against a method based on the test characteristic function (TCF) of the linear test and traditional equipercentile equating applied to the ability estimates on the adaptive test for a population of test takers. The two local methods were generally best. Surprisingly, the TCF method performed slightly worse than the equipercentile method. Both methods showed strong bias and uniformly large inaccuracy, but the TCF method suffered from extra error due to the lower asymptote of the test characteristic function. It is argued that the worse performances of the two methods are a consequence of the fact that they use a single equating transformation for an entire population of test takers and therefore have to compromise between the individual score distributions. }, keywords = {computerized adaptive testing, equipercentile equating, local equating, score reporting, test characteristic function}, isbn = {0146-6216 (Print)}, author = {van der Linden, W. J.} } @article {16, title = {Maximum information stratification method for controlling item exposure in computerized adaptive testing}, journal = {Psicothema}, volume = {18}, number = {1}, year = {2006}, note = {Barrada, Juan RamonMazuela, PalomaOlea, JulioResearch Support, Non-U.S. Gov{\textquoteright}tSpainPsicothemaPsicothema. 2006 Feb;18(1):156-9.}, month = {Feb}, pages = {156-159}, edition = {2007/02/14}, abstract = {The proposal for increasing the security in Computerized Adaptive Tests that has received most attention in recent years is the a-stratified method (AS - Chang and Ying, 1999): at the beginning of the test only items with low discrimination parameters (a) can be administered, with the values of the a parameters increasing as the test goes on. With this method, distribution of the exposure rates of the items is less skewed, while efficiency is maintained in trait-level estimation.
The pseudo-guessing parameter (c), present in the three-parameter logistic model, is considered irrelevant, and is not used in the AS method. The Maximum Information Stratified (MIS) model incorporates the c parameter in the stratification of the bank and in the item-selection rule, improving accuracy by comparison with the AS method, for item banks with a and b parameters correlated and uncorrelated. For both kinds of banks, the blocking b methods (Chang, Qian and Ying, 2001) improve the security of the item bank.}, keywords = {*Artificial Intelligence, *Microcomputers, *Psychological Tests, *Software Design, Algorithms, Chi-Square Distribution, Humans, Likelihood Functions}, isbn = {0214-9915 (Print)}, author = {Barrada, J. and Mazuela, P. and Olea, J.} } @article {2073, title = {Simulated computerized adaptive test for patients with lumbar spine impairments was efficient and produced valid measures of function}, journal = {Journal of Clinical Epidemiology}, volume = {59}, year = {2006}, pages = {947{\textendash}956}, abstract = {Objective: To equate physical functioning (PF) items with Back Pain Functional Scale (BPFS) items, develop a computerized adaptive test (CAT) designed to assess lumbar spine functional status (LFS) in people with lumbar spine impairments, and compare the discriminant validity of LFS measures (theta(IRT)) generated using all items analyzed with a rating scale Item Response Theory model (RSM) and measures generated using the simulated CAT (theta(CAT)). Methods: We performed a secondary analysis of retrospective intake rehabilitation data. Results: Unidimensionality and local independence of 25 BPFS and PF items were supported. Differential item functioning was negligible for levels of symptom acuity, gender, age, and surgical history. The RSM fit the data well. A lumbar spine specific CAT was developed that was 72\% more efficient than using all 25 items to estimate LFS measures. theta(IRT) and theta(CAT) measures did not discriminate patients by symptom acuity, age, or gender, but discriminated patients by surgical history in similar clinically logical ways. theta(CAT) measures were as precise as theta(IRT) measures.
Conclusion: A body part specific simulated CAT developed from an LFS item bank was efficient and produced precise measures of LFS without eroding discriminant validity.}, keywords = {Back Pain Functional Scale, computerized adaptive testing, Item Response Theory, Lumbar spine, Rehabilitation, True-score equating}, doi = {10.1016/j.jclinepi.2005.10.017}, author = {Hart, D. L. and Mioduski, J. E. and Werneke, M. W. and Stratford, P. W.} } @article {150, title = {Computer adaptive testing}, journal = {Journal of Applied Measurement}, volume = {6}, number = {1}, year = {2005}, note = {Gershon, Richard CReviewUnited StatesJournal of applied measurementJ Appl Meas. 2005;6(1):109-27.}, pages = {109-27}, edition = {2005/02/11}, abstract = {The creation of item response theory (IRT) and Rasch models, inexpensive accessibility to high speed desktop computers, and the growth of the Internet, has led to the creation and growth of computerized adaptive testing or CAT. This form of assessment is applicable for both high stakes tests such as certification or licensure exams, as well as health related quality of life surveys. This article discusses the historical background of CAT including its many advantages over conventional (typically paper and pencil) alternatives. The process of CAT is then described including descriptions of the specific differences of using CAT based upon 1-, 2- and 3-parameter IRT and various Rasch models. Numerous specific topics describing CAT in practice are described including: initial item selection, content balancing, test difficulty, test length and stopping rules. The article concludes with the author{\textquoteright}s reflections regarding the future of CAT.}, keywords = {*Internet, *Models, Statistical, *User-Computer Interface, Certification, Health Surveys, Humans, Licensure, Microcomputers, Quality of Life}, isbn = {1529-7713 (Print)}, author = {Gershon, R. C.} } @article {185, title = {Simulated computerized adaptive tests for measuring functional status were efficient with good discriminant validity in patients with hip, knee, or foot/ankle impairments}, journal = {Journal of Clinical Epidemiology}, volume = {58}, number = {6}, year = {2005}, note = {0895-4356 (Print)Journal ArticleMulticenter StudyValidation Studies}, pages = {629-38}, abstract = {BACKGROUND AND OBJECTIVE: To develop computerized adaptive tests (CATs) designed to assess lower extremity functional status (FS) in people with lower extremity impairments using items from the Lower Extremity Functional Scale and compare discriminant validity of FS measures generated using all items analyzed with a rating scale Item Response Theory model (theta(IRT)) and measures generated using the simulated CATs (theta(CAT)). METHODS: Secondary analysis of retrospective intake rehabilitation data. RESULTS: Unidimensionality of items was strong, and local independence of items was adequate. Differential item functioning (DIF) affected item calibration related to body part, that is, hip, knee, or foot/ankle, but DIF did not affect item calibration for symptom acuity, gender, age, or surgical history. Therefore, patients were separated into three body part specific groups. The rating scale model fit all three data sets well. Three body part specific CATs were developed: each was 70\% more efficient than using all LEFS items to estimate FS measures. theta(IRT) and theta(CAT) measures discriminated patients by symptom acuity, age, and surgical history in similar ways. 
theta(CAT) measures were as precise as theta(IRT) measures. CONCLUSION: Body part-specific simulated CATs were efficient and produced precise measures of FS with good discriminant validity.}, keywords = {*Health Status Indicators, Activities of Daily Living, Adolescent, Adult, Aged, Aged, 80 and over, Ankle Joint/physiopathology, Diagnosis, Computer-Assisted/*methods, Female, Hip Joint/physiopathology, Humans, Joint Diseases/physiopathology/*rehabilitation, Knee Joint/physiopathology, Lower Extremity/*physiopathology, Male, Middle Aged, Research Support, N.I.H., Extramural, Research Support, U.S. Gov{\textquoteright}t, P.H.S., Retrospective Studies}, author = {Hart, D. L. and Mioduski, J. E. and Stratford, P. W.} } @booklet {201, title = {The AMC Linear Disability Score project in a population requiring residential care: psychometric properties}, journal = {Health and Quality of Life Outcomes}, volume = {2}, year = {2004}, note = {Holman, RebeccaLindeboom, RobertVermeulen, Marinusde Haan, Rob JResearch Support, Non-U.S. Gov{\textquoteright}tValidation StudiesEnglandHealth and quality of life outcomesHealth Qual Life Outcomes. 2004 Aug 3;2:42.}, month = {Aug 3}, pages = {42}, edition = {2004/08/05}, abstract = {BACKGROUND: Currently there is a lot of interest in the flexible framework offered by item banks for measuring patient relevant outcomes, including functional status. However, there are few item banks, which have been developed to quantify functional status, as expressed by the ability to perform activities of daily life. METHOD: This paper examines the psychometric properties of the AMC Linear Disability Score (ALDS) project item bank using an item response theory model and full information factor analysis. Data were collected from 555 respondents on a total of 160 items. RESULTS: Following the analysis, 79 items remained in the item bank. The remaining 81 items were excluded because of: difficulties in presentation (1 item); low levels of variation in response pattern (28 items); significant differences in measurement characteristics for males and females or for respondents under or over 85 years old (26 items); or lack of model fit to the data at item level (26 items). CONCLUSIONS: It is conceivable that the item bank will have different measurement characteristics for other patient or demographic populations. However, these results indicate that the ALDS item bank has sound psychometric properties for respondents in residential care settings and could form a stable base for measuring functional status in a range of situations, including the implementation of computerised adaptive testing of functional status.}, keywords = {*Disability Evaluation, *Health Status Indicators, Activities of Daily Living/*classification, Adult, Aged, Aged, 80 and over, Data Collection/methods, Female, Humans, Logistic Models, Male, Middle Aged, Netherlands, Pilot Projects, Probability, Psychometrics/*instrumentation, Questionnaires/standards, Residential Facilities/*utilization, Severity of Illness Index}, isbn = {1477-7525 (Electronic)1477-7525 (Linking)}, author = {Holman, R. and Lindeboom, R. and Vermeulen, M. and de Haan, R. J.} } @article {11, title = {Computerized adaptive testing with multiple-form structures}, journal = {Applied Psychological Measurement}, volume = {28}, number = {3}, year = {2004}, pages = {147-164}, publisher = {Sage Publications: US}, abstract = {A multiple-form structure (MFS) is an ordered collection or network of testlets (i.e., sets of items). 
An examinee{\textquoteright}s progression through the network of testlets is dictated by the correctness of an examinee{\textquoteright}s answers, thereby adapting the test to his or her trait level. The collection of paths through the network yields the set of all possible test forms, allowing test specialists the opportunity to review them before they are administered. Also, limiting the exposure of an individual MFS to a specific period of time can enhance test security. This article provides an overview of methods that have been developed to generate parallel MFSs. The approach is applied to the assembly of an experimental computerized Law School Admission Test (LSAT). (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, Law School Admission Test, multiple-form structure, testlets}, isbn = {0146-6216 (Print)}, author = {Armstrong, R. D. and Jones, D. H. and Koppel, N. B. and Pashley, P. J.} } @article {30, title = {Calibration of an item pool for assessing the burden of headaches: an application of item response theory to the Headache Impact Test (HIT)}, journal = {Quality of Life Research}, volume = {12}, number = {8}, year = {2003}, note = {0962-9343Journal Article}, pages = {913-933}, abstract = {BACKGROUND: Measurement of headache impact is important in clinical trials, case detection, and the clinical monitoring of patients. Computerized adaptive testing (CAT) of headache impact has potential advantages over traditional fixed-length tests in terms of precision, relevance, real-time quality control and flexibility. OBJECTIVE: To develop an item pool that can be used for a computerized adaptive test of headache impact. METHODS: We analyzed responses to four well-known tests of headache impact from a population-based sample of recent headache sufferers (n = 1016). We used confirmatory factor analysis for categorical data and analyses based on item response theory (IRT). RESULTS: In factor analyses, we found very high correlations between the factors hypothesized by the original test constructers, both within and between the original questionnaires. These results suggest that a single score of headache impact is sufficient. We established a pool of 47 items which fitted the generalized partial credit IRT model. By simulating a computerized adaptive health test we showed that an adaptive test of only five items had a very high concordance with the score based on all items and that different worst-case item selection scenarios did not lead to bias. CONCLUSION: We have established a headache impact item pool that can be used in CAT of headache impact.}, keywords = {*Cost of Illness, *Decision Support Techniques, *Sickness Impact Profile, Adolescent, Adult, Aged, Comparative Study, Disability Evaluation, Factor Analysis, Statistical, Headache/*psychology, Health Surveys, Human, Longitudinal Studies, Middle Aged, Migraine/psychology, Models, Psychological, Psychometrics/*methods, Quality of Life/*psychology, Software, Support, Non-U.S. Gov{\textquoteright}t}, author = {Bjorner, J. B. and Kosinski, M. and Ware, J. E., Jr.} } @article {48, title = {Advances in quality of life measurements in oncology patients}, journal = {Seminars in Oncology}, volume = {29}, number = {3 Suppl 8}, year = {2002}, note = {0093-7754 (Print)Journal ArticleReview}, month = {Jun}, pages = {60-8}, abstract = {Accurate assessment of the quality of life (QOL) of patients can provide important clinical information to physicians, especially in the area of oncology. 
Changes in QOL are important indicators of the impact of a new cytotoxic therapy, can affect a patient{\textquoteright}s willingness to continue treatment, and may aid in defining response in the absence of quantifiable endpoints such as tumor regression. Because QOL is becoming an increasingly important aspect in the management of patients with malignant disease, it is vital that the instruments used to measure QOL are reliable and accurate. Assessment of QOL involves a multidimensional approach that includes physical, functional, social, and emotional well-being, and the most comprehensive instruments measure at least three of these domains. Instruments to measure QOL can be generic (eg, the Nottingham Health Profile), targeted toward specific illnesses (eg, Functional Assessment of Cancer Therapy - Lung), or be a combination of generic and targeted. Two of the most widely used examples of the combination, or hybrid, instruments are the European Organization for Research and Treatment of Cancer Quality of Life Questionnaire Core 30 Items and the Functional Assessment of Chronic Illness Therapy. A consequence of the increasing international collaboration in clinical trials has been the growing necessity for instruments that are valid across languages and cultures. To assure the continuing reliability and validity of QOL instruments in this regard, item response theory can be applied. Techniques such as item response theory may be used in the future to construct QOL item banks containing large sets of validated questions that represent various levels of QOL domains. As QOL becomes increasingly important in understanding and approaching the overall management of cancer patients, the tools available to clinicians and researchers to assess QOL will continue to evolve. While the instruments currently available provide reliable and valid measurement, further improvements in precision and application are anticipated.}, keywords = {*Quality of Life, *Sickness Impact Profile, Cross-Cultural Comparison, Culture, Humans, Language, Neoplasms/*physiopathology, Questionnaires}, author = {Cella, D. and Chang, C-H. and Lai, J. S. and Webster, K.} } @article {12, title = {Information technology and literacy assessment}, journal = {Reading and Writing Quarterly}, volume = {18}, number = {4}, year = {2002}, pages = {369-373}, abstract = {This column discusses information technology and literacy assessment in the past and present. The author also describes computer-based assessments today including the following topics: computer-scored testing, computer-administered formal assessment, Internet formal assessment, computerized adaptive tests, placement tests, informal assessment, electronic portfolios, information management, and Internet information dissemination. A model of the major present-day applications of information technologies in reading and literacy assessment is also included. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Applications, Computer Assisted Testing, Information, Internet, Literacy, Models, Systems, Technology}, author = {Balajthy, E.} } @article {146, title = {Multidimensional adaptive testing for mental health problems in primary care}, journal = {Medical Care}, volume = {40}, number = {9}, year = {2002}, note = {Gardner, WilliamKelleher, Kelly JPajer, Kathleen AMCJ-177022/PHS HHS/MH30915/MH/NIMH NIH HHS/MH50629/MH/NIMH NIH HHS/Med Care. 
2002 Sep;40(9):812-23.}, month = {Sep}, pages = {812-23}, edition = {2002/09/10}, abstract = {OBJECTIVES: Efficient and accurate instruments for assessing child psychopathology are increasingly important in clinical practice and research. For example, screening in primary care settings can identify children and adolescents with disorders that may otherwise go undetected. However, primary care offices are notorious for the brevity of visits and screening must not burden patients or staff with long questionnaires. One solution is to shorten assessment instruments, but dropping questions typically makes an instrument less accurate. An alternative is adaptive testing, in which a computer selects the items to be asked of a patient based on the patient{\textquoteright}s previous responses. This research used a simulation to test a child mental health screen based on this technology. RESEARCH DESIGN: Using half of a large sample of data, a computerized version was developed of the Pediatric Symptom Checklist (PSC), a parental-report psychosocial problem screen. With the unused data, a simulation was conducted to determine whether the Adaptive PSC can reproduce the results of the full PSC with greater efficiency. SUBJECTS: PSCs were completed by parents on 21,150 children seen in a national sample of primary care practices. RESULTS: Four latent psychosocial problem dimensions were identified through factor analysis: internalizing problems, externalizing problems, attention problems, and school problems. A simulated adaptive test measuring these traits asked an average of 11.6 questions per patient, and asked five or fewer questions for 49\% of the sample. There was high agreement between the adaptive test and the full (35-item) PSC: only 1.3\% of screening decisions were discordant (kappa = 0.93). This agreement was higher than that obtained using a comparable length (12-item) short-form PSC (3.2\% of decisions discordant; kappa = 0.84). CONCLUSIONS: Multidimensional adaptive testing may be an accurate and efficient technology for screening for mental health problems in primary care settings.}, keywords = {Adolescent, Child, Child Behavior Disorders/*diagnosis, Child Health Services/*organization \& administration, Factor Analysis, Statistical, Female, Humans, Linear Models, Male, Mass Screening/*methods, Parents, Primary Health Care/*organization \& administration}, isbn = {0025-7079 (Print)0025-7079 (Linking)}, author = {Gardner, W. and Kelleher, K. J. and Pajer, K. A.} } @article {234, title = {Evaluating the usefulness of computerized adaptive testing for medical in-course assessment}, journal = {Academic Medicine}, volume = {74}, number = {10}, year = {1999}, note = {Kreiter, C DFerguson, KGruppen, L DUnited statesAcademic medicine : journal of the Association of American Medical CollegesAcad Med. 1999 Oct;74(10):1125-8.}, month = {Oct}, pages = {1125-8}, edition = {1999/10/28}, abstract = {PURPOSE: This study investigated the feasibility of converting an existing computer-administered, in-course internal medicine test to an adaptive format. METHOD: A 200-item internal medicine extended matching test was used for this research. Parameters were estimated with commercially available software with responses from 621 examinees. A specially developed simulation program was used to retrospectively estimate the efficiency of the computer-adaptive exam format. 
RESULTS: It was found that the average test length could be shortened by almost half, with measurement precision approximately equal to that of the full 200-item paper-and-pencil test. However, computer-adaptive testing with this item bank provided little advantage for examinees at the upper end of the ability continuum. An examination of classical item statistics and IRT item statistics suggested that adding more difficult items might extend the advantage to this group of examinees. CONCLUSIONS: Medical item banks presently used for in-course assessment might be advantageously employed in adaptive testing. However, it is important to evaluate the match between the items and the measurement objective of the test before implementing this format.}, keywords = {*Automation, *Education, Medical, Undergraduate, Educational Measurement/*methods, Humans, Internal Medicine/*education, Likelihood Functions, Psychometrics/*methods, Reproducibility of Results}, isbn = {1040-2446 (Print)}, author = {Kreiter, C. D. and Ferguson, K. and Gruppen, L. D.} } @article {217, title = {Moving in a new direction: Computerized adaptive testing (CAT)}, journal = {Nursing Management}, volume = {24}, number = {1}, year = {1993}, note = {Jones-Dickson, CDorsey, DCampbell-Warnock, JFields, FUnited statesNursing managementNurs Manage. 1993 Jan;24(1):80, 82.}, month = {Jan}, pages = {80, 82}, edition = {1993/01/01}, keywords = {*Computers, Accreditation/methods, Educational Measurement/*methods, Licensure, Nursing, United States}, isbn = {0744-6314 (Print)}, author = {Jones-Dickson, C. and Dorsey, D. and Campbell-Warnock, J. and Fields, F.} }