@article {2750, title = {Improving Precision of CAT Measures}, journal = {Journal of Computerized Adaptive Testing}, volume = {9}, year = {2022}, month = {10/2022}, pages = {1-7}, keywords = {dichotomously scored items, option probability theory, scoring methods, subjective probability}, issn = {2165-6592}, doi = {10.7333/2210-0901001}, author = {John J. Barnard} } @conference {2667, title = {Adapting Linear Models for Optimal Test Design to More Complex Test Specifications}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Combinatorial optimization (CO) has proven to be a very helpful approach for addressing and solving test assembly problems. Furthermore, CO has been applied to several test designs, including: (1) the development of linear test forms; (2) computerized adaptive testing; and (3) multistage testing. In his seminal work, van der Linden (2006) laid out the basis for using linear models to simultaneously assemble exams and item pools under a variety of conditions: (1) for single tests and multiple tests; (2) with item sets; etc. However, for some testing programs, the number and complexity of test specifications can grow rapidly. Consequently, the mathematical representation of the test assembly problem goes beyond most approaches reported either in van der Linden{\textquoteright}s book or in the majority of other publications related to test assembly. In this presentation, we extend van der Linden{\textquoteright}s framework by including the concept of blocks for test specifications. We modify the usual mathematical notation of a test assembly problem to include this concept and show how it can be applied to various test designs. Finally, we demonstrate an implementation of this approach in a stand-alone software application called ATASolver.


}, keywords = {Complex Test Specifications, Linear Models, Optimal Test Design}, author = {Maxim Morin} } @conference {2654, title = {Concerto 5 Open Source CAT Platform: From Code to Nodes}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Concerto 5 is the newest version of the Concerto open source R-based Computer-Adaptive Testing platform, which is currently used in educational testing and in clinical trials. In our quest to make CAT accessible to all, the latest version uses flowchart nodes to connect different elements of a test, so that CAT test creation is an intuitive high-level process that does not require writing code.

A test creator might connect an Info Page node to a Consent Page node, to a CAT node, and then to a Feedback node; after uploading their items, the test is done.

This talk will show the new flowchart interface, and demonstrate the creation of a CAT test from scratch in less than 10 minutes.

Concerto 5 also includes a new Polytomous CAT node, so CATs with Likert items can be easily created in the flowchart interface. This node is currently used in depression and anxiety tests in a clinical trial.


}, keywords = {Concerto 5, Open Source CAT}, url = {https://drive.google.com/open?id=11eu1KKILQEoK5c-CYO1P1AiJgiQxX0E0}, author = {David Stillwell} } @conference {2650, title = {Item Parameter Drifting and Online Calibration}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Item calibration is one of the most important topics in item response theory (IRT). Since many large-scale testing programs have switched from paper-and-pencil (P\&P) testing mode to computerized adaptive testing (CAT) mode, developing methods for efficiently calibrating new items has become vital. Among the many item calibration processes proposed for CAT, online calibration is the most cost-effective. This presentation introduces an online (re)calibration design to detect item parameter drift for computerized adaptive testing (CAT) in both unidimensional and multidimensional environments. Specifically, for optimal online calibration design in a unidimensional CAT model, a two-stage design is proposed that implements a proportional density index algorithm. For a multidimensional CAT model, a four-quadrant online calibration pretest item selection design with a proportional density index algorithm is proposed. Comparisons were made between different online calibration item selection strategies. Results showed that, under unidimensional CAT, the proposed modified two-stage item selection criterion with the proportional density algorithm outperformed the other existing methods in terms of item parameter calibration and item parameter drift detection; under multidimensional CAT, the online (re)calibration technique with the proposed four-quadrant item selection design and proportional density index outperformed other methods.


}, keywords = {online calibration, Parameter Drift}, author = {Hua-Hua Chang and Rui Guo} } @conference {2630, title = {Scripted On-the-fly Multistage Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

On-the-fly multistage testing (OMST) was recently introduced as a promising alternative to preassembled multistage testing (MST). A decidedly appealing feature of both is the reviewability of items within the current stage. However, the fundamental difference is that, instead of routing to a preassembled module, OMST adaptively assembles a module at each stage according to an interim ability estimate. This produces more individualized forms with finer measurement precision, but imposing nonstatistical constraints and controlling item exposure become more cumbersome. One recommendation is to use the maximum priority index followed by a remediation step to satisfy content constraints, and the Sympson-Hetter method with a stratified item bank for exposure control.

However, these methods can be computationally expensive, thereby impeding practical implementation. Therefore, this study investigated the script method as a simpler solution to the challenge of strict content balancing and effective item exposure control in OMST. The script method was originally devised as an item selection algorithm for CAT and generally proceeds as follows: For a test with m items, there are m slots to be filled, and an item is selected according to pre-defined rules for each slot. For the first slot, randomly select an item from a designated content area (collection). For each subsequent slot, 1) Discard any enemies of items already administered in previous slots; 2) Draw a designated number of candidate items (selection length) from the designated collection according to the current ability estimate; 3) Randomly select one item from the set of candidates. There are two distinct features of the script method. First, a predetermined sequence of collections guarantees meeting content specifications. The specific ordering may be determined either randomly or deliberately by content experts. Second, steps 2 and 3 depict a method of exposure control, in which selection length balances item usage at the possible expense of ability estimation accuracy. The adaptation of the script method to OMST is straightforward. For the first module, randomly select each item from a designated collection. For each subsequent module, the process is the same as in scripted CAT (SCAT) except the same ability estimate is used for the selection of all items within the module. A series of simulations was conducted to evaluate the performance of scripted OMST (SOMST, with 3 or 4 evenly divided stages) relative to SCAT under various item exposure restrictions. In all conditions, reliability was maximized by programming an optimization algorithm that searches for the smallest possible selection length for each slot within the constraints. Preliminary results indicated that SOMST is certainly a capable design with performance comparable to that of SCAT. The encouraging findings and ease of implementation highly motivate the prospect of operational use for large-scale assessments.


}, keywords = {CAT, multistage testing, On-the-fly testing}, url = {https://drive.google.com/open?id=1wKuAstITLXo6BM4APf2mPsth1BymNl-y}, author = {Edison Choe and Bruce Williams and Sung-Hyuck Lee} } @article {2455, title = {Implementing a CAT: The AMC Experience }, journal = {Journal of Computerized Adaptive Testing}, volume = {3}, year = {2015}, pages = {1-12}, type = {Applications and Implementations}, keywords = {adaptive, Assessment, computer, medical, online, Testing}, issn = {2165-6592}, doi = {10.7333/15100301001}, url = {http://www.iacat.org/jcat/index.php/jcat/article/view/52/25}, author = {Barnard, John J} } @article {2305, title = {Comparison of two Bayesian methods to detect mode effects between paper-based and computerized adaptive assessments: a preliminary Monte Carlo study.}, journal = {BMC Med Res Methodol}, volume = {12}, year = {2012}, month = {2012}, pages = {124}, abstract = {

BACKGROUND: Computerized adaptive testing (CAT) is being applied to health outcome measures developed as paper-and-pencil (P\&P) instruments. Differences in how respondents answer items administered by CAT vs. P\&P can increase error in CAT-estimated measures if not identified and corrected.

METHOD: Two methods for detecting item-level mode effects are proposed using Bayesian estimation of posterior distributions of item parameters: (1) a modified robust Z (RZ) test, and (2) 95\% credible intervals (CrI) for the CAT-P\&P difference in item difficulty. A simulation study was conducted under the following conditions: (1) data-generating model (one- vs. two-parameter IRT model); (2) moderate vs. large DIF sizes; (3) percentage of DIF items (10\% vs. 30\%); and (4) mean difference in θ estimates across modes of 0 vs. 1 logits. This resulted in a total of 16 conditions with 10 generated datasets per condition.

RESULTS: Both methods evidenced good to excellent false positive control, with RZ providing better control of false positives and CrI providing slightly higher power, irrespective of measurement model. False positives increased when items were very easy to endorse and when there were mode differences in mean trait level. True positives were predicted by CAT item usage, absolute item difficulty, and item discrimination. RZ outperformed CrI, due to better control of false positive DIF.

CONCLUSIONS: Whereas false positives were well controlled, particularly for RZ, power to detect DIF was suboptimal. Research is needed to examine the robustness of these methods under varying prior assumptions concerning the distribution of item and person parameters and when data fail to conform to prior assumptions. False identification of DIF when items were very easy to endorse is a problem warranting additional investigation.

}, keywords = {Bayes Theorem, Data Interpretation, Statistical, Humans, Mathematical Computing, Monte Carlo Method, Outcome Assessment (Health Care)}, issn = {1471-2288}, doi = {10.1186/1471-2288-12-124}, author = {Riley, Barth B and Carle, Adam C} } @conference {2108, title = {Optimal Calibration Designs for Computerized Adaptive Testing}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Optimization

How can we exploit the advantages of Balanced Block Design while keeping the logistics manageable?

Homogeneous Designs: Overlap between test booklets as regular as possible


}, keywords = {balanced block design, CAT, item calibration, optimization, Rasch}, author = {Angela Verschoor} } @article {7, title = {Development and preliminary testing of a computerized adaptive assessment of chronic pain}, journal = {Journal of Pain}, volume = {10}, number = {9}, year = {2009}, note = {Anatchkova, Milena DSaris-Baglama, Renee NKosinski, MarkBjorner, Jakob B1R43AR052251-01A1/AR/NIAMS NIH HHS/United StatesEvaluation StudiesResearch Support, N.I.H., ExtramuralUnited StatesThe journal of pain : official journal of the American Pain SocietyJ Pain. 2009 Sep;10(9):932-43.}, month = {Sep}, pages = {932-943}, edition = {2009/07/15}, abstract = {The aim of this article is to report the development and preliminary testing of a prototype computerized adaptive test of chronic pain (CHRONIC PAIN-CAT) conducted in 2 stages: (1) evaluation of various item selection and stopping rules through real data-simulated administrations of CHRONIC PAIN-CAT; (2) a feasibility study of the actual prototype CHRONIC PAIN-CAT assessment system conducted in a pilot sample. Item calibrations developed from a US general population sample (N = 782) were used to program a pain severity and impact item bank (kappa = 45), and real data simulations were conducted to determine a CAT stopping rule. The CHRONIC PAIN-CAT was programmed on a tablet PC using QualityMetric{\textquoteright}s Dynamic Health Assessment (DYHNA) software and administered to a clinical sample of pain sufferers (n = 100). The CAT was completed in significantly less time than the static (full item bank) assessment (P < .001). On average, 5.6 items were dynamically administered by CAT to achieve a precise score. Scores estimated from the 2 assessments were highly correlated (r = .89), and both assessments discriminated across pain severity levels (P < .001, RV = .95). Patients{\textquoteright} evaluations of the CHRONIC PAIN-CAT were favorable. PERSPECTIVE: This report demonstrates that the CHRONIC PAIN-CAT is feasible for administration in a clinic. The application has the potential to improve pain assessment and help clinicians manage chronic pain.}, keywords = {*Computers, *Questionnaires, Activities of Daily Living, Adaptation, Psychological, Chronic Disease, Cohort Studies, Disability Evaluation, Female, Humans, Male, Middle Aged, Models, Psychological, Outcome Assessment (Health Care), Pain Measurement/*methods, Pain, Intractable/*diagnosis/psychology, Psychometrics, Quality of Life, User-Computer Interface}, isbn = {1528-8447 (Electronic)1526-5900 (Linking)}, author = {Anatchkova, M. D. and Saris-Baglama, R. N. and Kosinski, M. and Bjorner, J. B.} } @article {138, title = {Development of an item bank for the assessment of depression in persons with mental illnesses and physical diseases using Rasch analysis}, journal = {Rehabilitation Psychology}, volume = {54}, number = {2}, year = {2009}, note = {Forkmann, ThomasBoecker, MarenNorra, ChristineEberle, NicoleKircher, TiloSchauerte, PatrickMischke, KarlWesthofen, MartinGauggel, SiegfriedWirtz, MarkusResearch Support, Non-U.S. Gov{\textquoteright}tUnited StatesRehabilitation psychologyRehabil Psychol. 2009 May;54(2):186-97.}, month = {May}, pages = {186-97}, edition = {2009/05/28}, abstract = {OBJECTIVE: The calibration of item banks provides the basis for computerized adaptive testing that ensures high diagnostic precision and minimizes participants{\textquoteright} test burden. 
The present study aimed at developing a new item bank that allows for assessing depression in persons with mental and persons with somatic diseases. METHOD: The sample consisted of 161 participants treated for a depressive syndrome, and 206 participants with somatic illnesses (103 cardiologic, 103 otorhinolaryngologic; overall mean age = 44.1 years, SD =14.0; 44.7\% women) to allow for validation of the item bank in both groups. Persons answered a pool of 182 depression items on a 5-point Likert scale. RESULTS: Evaluation of Rasch model fit (infit < 1.3), differential item functioning, dimensionality, local independence, item spread, item and person separation (>2.0), and reliability (>.80) resulted in a bank of 79 items with good psychometric properties. CONCLUSIONS: The bank provides items with a wide range of content coverage and may serve as a sound basis for computerized adaptive testing applications. It might also be useful for researchers who wish to develop new fixed-length scales for the assessment of depression in specific rehabilitation settings.}, keywords = {Adaptation, Psychological, Adult, Aged, Depressive Disorder/*diagnosis/psychology, Diagnosis, Computer-Assisted, Female, Heart Diseases/*psychology, Humans, Male, Mental Disorders/*psychology, Middle Aged, Models, Statistical, Otorhinolaryngologic Diseases/*psychology, Personality Assessment/statistics \& numerical data, Personality Inventory/*statistics \& numerical data, Psychometrics/statistics \& numerical data, Questionnaires, Reproducibility of Results, Sick Role}, isbn = {0090-5550 (Print)0090-5550 (Linking)}, author = {Forkmann, T. and Boecker, M. and Norra, C. and Eberle, N. and Kircher, T. and Schauerte, P. and Mischke, K. and Westhofen, M. and Gauggel, S. and Wirtz, M.} } @article {173, title = {Replenishing a computerized adaptive test of patient-reported daily activity functioning}, journal = {Quality of Life Research}, volume = {18}, number = {4}, year = {2009}, note = {Haley, Stephen MNi, PengshengJette, Alan MTao, WeiMoed, RichardMeyers, DougLudlow, Larry HK02 HD45354-01/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2009 May;18(4):461-71. Epub 2009 Mar 14.}, month = {May}, pages = {461-71}, edition = {2009/03/17}, abstract = {PURPOSE: Computerized adaptive testing (CAT) item banks may need to be updated, but before new items can be added, they must be linked to the previous CAT. The purpose of this study was to evaluate 41 pretest items prior to including them into an operational CAT. METHODS: We recruited 6,882 patients with spine, lower extremity, upper extremity, and nonorthopedic impairments who received outpatient rehabilitation in one of 147 clinics across 13 states of the USA. Forty-one new Daily Activity (DA) items were administered along with the Activity Measure for Post-Acute Care Daily Activity CAT (DA-CAT-1) in five separate waves. We compared the scoring consistency with the full item bank, test information function (TIF), person standard errors (SEs), and content range of the DA-CAT-1 to the new CAT (DA-CAT-2) with the pretest items by real data simulations. RESULTS: We retained 29 of the 41 pretest items. Scores from the DA-CAT-2 were more consistent (ICC = 0.90 versus 0.96) than DA-CAT-1 when compared with the full item bank. 
TIF and person SEs were improved for persons with higher levels of DA functioning, and ceiling effects were reduced from 16.1\% to 6.1\%. CONCLUSIONS: Item response theory and online calibration methods were valuable in improving the DA-CAT.}, keywords = {*Activities of Daily Living, *Disability Evaluation, *Questionnaires, *User-Computer Interface, Adult, Aged, Cohort Studies, Computer-Assisted Instruction, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods}, isbn = {0962-9343 (Print)0962-9343 (Linking)}, author = {Haley, S. M. and Ni, P. and Jette, A. M. and Tao, W. and Moed, R. and Meyers, D. and Ludlow, L. H.} } @article {88, title = {Assessing self-care and social function using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {4}, year = {2008}, note = {Coster, Wendy JHaley, Stephen MNi, PengshengDumas, Helene MFragala-Pinkham, Maria AK02 HD45354-01A1/HD/NICHD NIH HHS/United StatesR41 HD052318-01A1/HD/NICHD NIH HHS/United StatesR43 HD42388-01/HD/NICHD NIH HHS/United StatesComparative StudyResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Apr;89(4):622-9.}, month = {Apr}, pages = {622-629}, edition = {2008/04/01}, abstract = {OBJECTIVE: To examine score agreement, validity, precision, and response burden of a prototype computer adaptive testing (CAT) version of the self-care and social function scales of the Pediatric Evaluation of Disability Inventory compared with the full-length version of these scales. DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics; community-based day care, preschool, and children{\textquoteright}s homes. PARTICIPANTS: Children with disabilities (n=469) and 412 children with no disabilities (analytic sample); 38 children with disabilities and 35 children without disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from prototype CAT applications of each scale using 15-, 10-, and 5-item stopping rules; scores from the full-length self-care and social function scales; time (in seconds) to complete assessments and respondent ratings of burden. RESULTS: Scores from both computer simulations and field administration of the prototype CATs were highly consistent with scores from full-length administration (r range, .94-.99). Using computer simulation of retrospective data, discriminant validity, and sensitivity to change of the CATs closely approximated that of the full-length scales, especially when the 15- and 10-item stopping rules were applied. In the cross-validation study the time to administer both CATs was 4 minutes, compared with over 16 minutes to complete the full-length scales. 
CONCLUSIONS: Self-care and social function score estimates from CAT administration are highly comparable with those obtained from full-length scale administration, with small losses in validity and precision and substantial decreases in administration time.}, keywords = {*Disability Evaluation, *Social Adjustment, Activities of Daily Living, Adolescent, Age Factors, Child, Child, Preschool, Computer Simulation, Cross-Over Studies, Disabled Children/*rehabilitation, Female, Follow-Up Studies, Humans, Infant, Male, Outcome Assessment (Health Care), Reference Values, Reproducibility of Results, Retrospective Studies, Risk Factors, Self Care/*standards/trends, Sex Factors, Sickness Impact Profile}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Coster, W. J. and Haley, S. M. and Ni, P. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {169, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: II. Participation outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {2}, year = {2008}, note = {Haley, Stephen MGandek, BarbaraSiebens, HilaryBlack-Schaffer, Randie MSinclair, Samuel JTao, WeiCoster, Wendy JNi, PengshengJette, Alan MK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesK02 HD45354-01/HD/NICHD NIH HHS/United StatesR01 HD043568/HD/NICHD NIH HHS/United StatesR01 HD043568-01/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Feb;89(2):275-83.}, month = {Feb}, pages = {275-283}, edition = {2008/01/30}, abstract = {OBJECTIVES: To measure participation outcomes with a computerized adaptive test (CAT) and compare CAT and traditional fixed-length surveys in terms of score agreement, respondent burden, discriminant validity, and responsiveness. DESIGN: Longitudinal, prospective cohort study of patients interviewed approximately 2 weeks after discharge from inpatient rehabilitation and 3 months later. SETTING: Follow-up interviews conducted in patient{\textquoteright}s home setting. PARTICIPANTS: Adults (N=94) with diagnoses of neurologic, orthopedic, or medically complex conditions. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Participation domains of mobility, domestic life, and community, social, \& civic life, measured using a CAT version of the Participation Measure for Postacute Care (PM-PAC-CAT) and a 53-item fixed-length survey (PM-PAC-53). RESULTS: The PM-PAC-CAT showed substantial agreement with PM-PAC-53 scores (intraclass correlation coefficient, model 3,1, .71-.81). On average, the PM-PAC-CAT was completed in 42\% of the time and with only 48\% of the items as compared with the PM-PAC-53. Both formats discriminated across functional severity groups. The PM-PAC-CAT had modest reductions in sensitivity and responsiveness to patient-reported change over a 3-month interval as compared with the PM-PAC-53. 
CONCLUSIONS: Although continued evaluation is warranted, accurate estimates of participation status and responsiveness to change for group-level analyses can be obtained from CAT administrations, with a sizeable reduction in respondent burden.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Haley, S. M. and Gandek, B. and Siebens, H. and Black-Schaffer, R. M. and Sinclair, S. J. and Tao, W. and Coster, W. J. and Ni, P. and Jette, A. M.} } @article {5, title = {Efficiency and sensitivity of multidimensional computerized adaptive testing of pediatric physical functioning}, journal = {Disability \& Rehabilitation}, volume = {30}, number = {6}, year = {2008}, note = {Allen, Diane DNi, PengshengHaley, Stephen MK02 HD45354-01/HD/NICHD NIH HHS/United StatesNIDDR H133P0001/DD/NCBDD CDC HHS/United StatesResearch Support, N.I.H., ExtramuralEnglandDisability and rehabilitationDisabil Rehabil. 2008;30(6):479-84.}, pages = {479-84}, edition = {2008/02/26}, abstract = {PURPOSE: Computerized adaptive tests (CATs) have efficiency advantages over fixed-length tests of physical functioning but may lose sensitivity when administering extremely low numbers of items. Multidimensional CATs may efficiently improve sensitivity by capitalizing on correlations between functional domains. Using a series of empirical simulations, we assessed the efficiency and sensitivity of multidimensional CATs compared to a longer fixed-length test. METHOD: Parent responses to the Pediatric Evaluation of Disability Inventory before and after intervention for 239 children at a pediatric rehabilitation hospital provided the data for this retrospective study. Reliability, effect size, and standardized response mean were compared between full-length self-care and mobility subscales and simulated multidimensional CATs with stopping rules at 40, 30, 20, and 10 items. RESULTS: Reliability was lowest in the 10-item CAT condition for the self-care (r = 0.85) and mobility (r = 0.79) subscales; all other conditions had high reliabilities (r > 0.94). All multidimensional CAT conditions had equivalent levels of sensitivity compared to the full set condition for both domains. CONCLUSIONS: Multidimensional CATs efficiently retain the sensitivity of longer fixed-length measures even with 5 items per dimension (10-item CAT condition). Measuring physical functioning with multidimensional CATs could enhance sensitivity following intervention while minimizing response burden.}, keywords = {*Disability Evaluation, Child, Computers, Disabled Children/*classification/rehabilitation, Efficiency, Humans, Outcome Assessment (Health Care), Psychometrics, Reproducibility of Results, Retrospective Studies, Self Care, Sensitivity and Specificity}, isbn = {0963-8288 (Print)0963-8288 (Linking)}, author = {Allen, D. D. and Ni, P. and Haley, S. 
M.} } @article {152, title = {Using computerized adaptive testing to reduce the burden of mental health assessment}, journal = {Psychiatric Services}, volume = {59}, number = {4}, year = {2008}, note = {Gibbons, Robert DWeiss, David JKupfer, David JFrank, EllenFagiolini, AndreaGrochocinski, Victoria JBhaumik, Dulal KStover, AngelaBock, R DarrellImmekus, Jason CR01-MH-30915/MH/United States NIMHR01-MH-66302/MH/United States NIMHResearch Support, N.I.H., ExtramuralUnited StatesPsychiatric services (Washington, D.C.)Psychiatr Serv. 2008 Apr;59(4):361-8.}, month = {Apr}, pages = {361-8}, edition = {2008/04/02}, abstract = {OBJECTIVE: This study investigated the combination of item response theory and computerized adaptive testing (CAT) for psychiatric measurement as a means of reducing the burden of research and clinical assessments. METHODS: Data were from 800 participants in outpatient treatment for a mood or anxiety disorder; they completed 616 items of the 626-item Mood and Anxiety Spectrum Scales (MASS) at two times. The first administration was used to design and evaluate a CAT version of the MASS by using post hoc simulation. The second confirmed the functioning of CAT in live testing. RESULTS: Tests of competing models based on item response theory supported the scale{\textquoteright}s bifactor structure, consisting of a primary dimension and four group factors (mood, panic-agoraphobia, obsessive-compulsive, and social phobia). Both simulated and live CAT showed a 95\% average reduction (585 items) in items administered (24 and 30 items, respectively) compared with administration of the full MASS. The correlation between scores on the full MASS and the CAT version was .93. For the mood disorder subscale, differences in scores between two groups of depressed patients--one with bipolar disorder and one without--on the full scale and on the CAT showed effect sizes of .63 (p<.003) and 1.19 (p<.001) standard deviation units, respectively, indicating better discriminant validity for CAT. CONCLUSIONS: Instead of using small fixed-length tests, clinicians can create item banks with a large item pool, and a small set of the items most relevant for a given individual can be administered with no loss of information, yielding a dramatic reduction in administration time and patient and clinician burden.}, keywords = {*Diagnosis, Computer-Assisted, *Questionnaires, Adolescent, Adult, Aged, Agoraphobia/diagnosis, Anxiety Disorders/diagnosis, Bipolar Disorder/diagnosis, Female, Humans, Male, Mental Disorders/*diagnosis, Middle Aged, Mood Disorders/diagnosis, Obsessive-Compulsive Disorder/diagnosis, Panic Disorder/diagnosis, Phobic Disorders/diagnosis, Reproducibility of Results, Time Factors}, isbn = {1075-2730 (Print)}, author = {Gibbons, R. D. and Weiss, D. J. and Kupfer, D. J. and Frank, E. and Fagiolini, A. and Grochocinski, V. J. and Bhaumik, D. K. and Stover, A. and Bock, R. D. and Immekus, J. C.} } @article {125, title = {The effect of including pretest items in an operational computerized adaptive test: Do different ability examinees spend different amounts of time on embedded pretest items?}, journal = {Educational Assessment}, volume = {12}, number = {2}, year = {2007}, pages = {161-173}, publisher = {Lawrence Erlbaum: US}, abstract = {The purpose of this study was to examine the effect of pretest items on response time in an operational, fixed-length, time-limited computerized adaptive test (CAT). 
These pretest items are embedded within the CAT, but unlike the operational items, are not tailored to the examinee{\textquoteright}s ability level. If examinees with higher ability levels need less time to complete these items than do their counterparts with lower ability levels, they will have more time to devote to the operational test questions. Data were from a graduate admissions test that was administered worldwide. Data from both quantitative and verbal sections of the test were considered. For the verbal section, examinees in the lower ability groups spent systematically more time on their pretest items than did those in the higher ability groups, though for the quantitative section the differences were less clear. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {ability, operational computerized adaptive test, pretest items, time}, isbn = {1062-7197 (Print); 1532-6977 (Electronic)}, author = {Ferdous, A. A. and Plake, B. S. and Chang, S-R.} } @article {52, title = {Improving patient reported outcomes using item response theory and computerized adaptive testing}, journal = {Journal of Rheumatology}, volume = {34}, number = {6}, year = {2007}, note = {Chakravarty, Eliza FBjorner, Jakob BFries, James FAr052158/ar/niamsConsensus Development ConferenceResearch Support, N.I.H., ExtramuralCanadaThe Journal of rheumatologyJ Rheumatol. 2007 Jun;34(6):1426-31.}, month = {Jun}, pages = {1426-31}, edition = {2007/06/07}, abstract = {OBJECTIVE: Patient reported outcomes (PRO) are considered central outcome measures for both clinical trials and observational studies in rheumatology. More sophisticated statistical models, including item response theory (IRT) and computerized adaptive testing (CAT), will enable critical evaluation and reconstruction of currently utilized PRO instruments to improve measurement precision while reducing item burden on the individual patient. METHODS: We developed a domain hierarchy encompassing the latent trait of physical function/disability from the more general to most specific. Items collected from 165 English-language instruments were evaluated by a structured process including trained raters, modified Delphi expert consensus, and then patient evaluation. Each item in the refined data bank will undergo extensive analysis using IRT to evaluate response functions and measurement precision. CAT will allow for real-time questionnaires of potentially smaller numbers of questions tailored directly to each individual{\textquoteright}s level of physical function. RESULTS: Physical function/disability domain comprises 4 subdomains: upper extremity, trunk, lower extremity, and complex activities. Expert and patient review led to consensus favoring use of present-tense "capability" questions using a 4- or 5-item Likert response construct over past-tense "performance"items. Floor and ceiling effects, attribution of disability, and standardization of response categories were also addressed. 
CONCLUSION: By applying statistical techniques of IRT through use of CAT, existing PRO instruments may be improved to reduce questionnaire burden on the individual patients while increasing measurement precision that may ultimately lead to reduced sample size requirements for costly clinical trials.}, keywords = {*Rheumatic Diseases/physiopathology/psychology, Clinical Trials, Data Interpretation, Statistical, Disability Evaluation, Health Surveys, Humans, International Cooperation, Outcome Assessment (Health Care)/*methods, Patient Participation/*methods, Research Design/*trends, Software}, isbn = {0315-162X (Print)}, author = {Chakravarty, E. F. and Bjorner, J. B. and Fries, J.F.} } @article {86, title = {IRT health outcomes data analysis project: an overview and summary}, journal = {Quality of Life Research}, volume = {16}, number = {Suppl. 1}, year = {2007}, note = {Cook, Karon FTeal, Cayla RBjorner, Jakob BCella, DavidChang, Chih-HungCrane, Paul KGibbons, Laura EHays, Ron DMcHorney, Colleen AOcepek-Welikson, KatjaRaczek, Anastasia ETeresi, Jeanne AReeve, Bryce B1U01AR52171-01/AR/United States NIAMSR01 (CA60068)/CA/United States NCIY1-PC-3028-01/PC/United States NCIResearch Support, N.I.H., ExtramuralNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2007;16 Suppl 1:121-32. Epub 2007 Mar 10.}, pages = {121-132}, edition = {2007/03/14}, abstract = {BACKGROUND: In June 2004, the National Cancer Institute and the Drug Information Association co-sponsored the conference, "Improving the Measurement of Health Outcomes through the Applications of Item Response Theory (IRT) Modeling: Exploration of Item Banks and Computer-Adaptive Assessment." A component of the conference was presentation of a psychometric and content analysis of a secondary dataset. OBJECTIVES: A thorough psychometric and content analysis was conducted of two primary domains within a cancer health-related quality of life (HRQOL) dataset. RESEARCH DESIGN: HRQOL scales were evaluated using factor analysis for categorical data, IRT modeling, and differential item functioning analyses. In addition, computerized adaptive administration of HRQOL item banks was simulated, and various IRT models were applied and compared. SUBJECTS: The original data were collected as part of the NCI-funded Quality of Life Evaluation in Oncology (Q-Score) Project. A total of 1,714 patients with cancer or HIV/AIDS were recruited from 5 clinical sites. MEASURES: Items from 4 HRQOL instruments were evaluated: Cancer Rehabilitation Evaluation System-Short Form, European Organization for Research and Treatment of Cancer Quality of Life Questionnaire, Functional Assessment of Cancer Therapy and Medical Outcomes Study Short-Form Health Survey. RESULTS AND CONCLUSIONS: Four lessons learned from the project are discussed: the importance of good developmental item banks, the ambiguity of model fit results, the limits of our knowledge regarding the practical implications of model misfit, and the importance in the measurement of HRQOL of construct definition. With respect to these lessons, areas for future research are suggested. 
The feasibility of developing item banks for broad definitions of health is discussed.}, keywords = {*Data Interpretation, Statistical, *Health Status, *Quality of Life, *Questionnaires, *Software, Female, HIV Infections/psychology, Humans, Male, Neoplasms/psychology, Outcome Assessment (Health Care)/*methods, Psychometrics, Stress, Psychological}, isbn = {0962-9343 (Print)}, author = {Cook, K. F. and Teal, C. R. and Bjorner, J. B. and Cella, D. and Chang, C-H. and Crane, P. K. and Gibbons, L. E. and Hays, R. D. and McHorney, C. A. and Ocepek-Welikson, K. and Raczek, A. E. and Teresi, J. A. and Reeve, B. B.} } @article {18, title = {Methods for restricting maximum exposure rate in computerized adaptative testing}, journal = {Methodology: European Journal of Research Methods for the Behavioral and Social Sciences}, volume = {3}, number = {1}, year = {2007}, pages = {14-23}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {The Sympson-Hetter (1985) method provides a means of controlling maximum exposure rate of items in Computerized Adaptive Testing. Through a series of simulations, control parameters are set that mark the probability of administration of an item on being selected. This method presents two main problems: it requires a long computation time for calculating the parameters and the maximum exposure rate is slightly above the fixed limit. Van der Linden (2003) presented two alternatives which appear to solve both of the problems. The impact of these methods in the measurement accuracy has not been tested yet. We show how these methods over-restrict the exposure of some highly discriminating items and, thus, the accuracy is decreased. It also shown that, when the desired maximum exposure rate is near the minimum possible value, these methods offer an empirical maximum exposure rate clearly above the goal. A new method, based on the initial estimation of the probability of administration and the probability of selection of the items with the restricted method (Revuelta \& Ponsoda, 1998), is presented in this paper. It can be used with the Sympson-Hetter method and with the two van der Linden{\textquoteright}s methods. This option, when used with Sympson-Hetter, speeds the convergence of the control parameters without decreasing the accuracy. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item bank security, item exposure control, overlap rate, Sympson-Hetter method}, isbn = {1614-1881 (Print); 1614-2241 (Electronic)}, author = {Barrada, J and Olea, J. and Ponsoda, V.} } @article {328, title = {Psychometric evaluation and calibration of health-related quality of life item banks: plans for the Patient-Reported Outcomes Measurement Information System (PROMIS)}, journal = {Medical Care}, volume = {45}, number = {5 Suppl 1}, year = {2007}, note = {Reeve, Bryce BHays, Ron DBjorner, Jakob BCook, Karon FCrane, Paul KTeresi, Jeanne AThissen, DavidRevicki, Dennis AWeiss, David JHambleton, Ronald KLiu, HonghuGershon, RichardReise, Steven PLai, Jin-sheiCella, DavidPROMIS Cooperative GroupAG015815/AG/United States NIAResearch Support, N.I.H., ExtramuralUnited StatesMedical careMed Care. 
2007 May;45(5 Suppl 1):S22-31.}, month = {May}, pages = {S22-31}, edition = {2007/04/20}, abstract = {BACKGROUND: The construction and evaluation of item banks to measure unidimensional constructs of health-related quality of life (HRQOL) is a fundamental objective of the Patient-Reported Outcomes Measurement Information System (PROMIS) project. OBJECTIVES: Item banks will be used as the foundation for developing short-form instruments and enabling computerized adaptive testing. The PROMIS Steering Committee selected 5 HRQOL domains for initial focus: physical functioning, fatigue, pain, emotional distress, and social role participation. This report provides an overview of the methods used in the PROMIS item analyses and proposed calibration of item banks. ANALYSES: Analyses include evaluation of data quality (eg, logic and range checking, spread of response distribution within an item), descriptive statistics (eg, frequencies, means), item response theory model assumptions (unidimensionality, local independence, monotonicity), model fit, differential item functioning, and item calibration for banking. RECOMMENDATIONS: Summarized are key analytic issues; recommendations are provided for future evaluations of item banks in HRQOL assessment.}, keywords = {*Health Status, *Information Systems, *Quality of Life, *Self Disclosure, Adolescent, Adult, Aged, Calibration, Databases as Topic, Evaluation Studies as Topic, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Psychometrics, Questionnaires/standards, United States}, isbn = {0025-7079 (Print)}, author = {Reeve, B. B. and Hays, R. D. and Bjorner, J. B. and Cook, K. F. and Crane, P. K. and Teresi, J. A. and Thissen, D. and Revicki, D. A. and Weiss, D. J. and Hambleton, R. K. and Liu, H. and Gershon, R. C. and Reise, S. P. and Lai, J. S. and Cella, D.} } @article {172, title = {Computer adaptive testing improved accuracy and precision of scores over random item selection in a physical functioning item bank}, journal = {Journal of Clinical Epidemiology}, volume = {59}, number = {11}, year = {2006}, note = {Haley, Stephen MNi, PengshengHambleton, Ronald KSlavin, Mary DJette, Alan MK02 hd45354-01/hd/nichdR01 hd043568/hd/nichdComparative StudyResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, Non-P.H.S.EnglandJournal of clinical epidemiologyJ Clin Epidemiol. 2006 Nov;59(11):1174-82. Epub 2006 Jul 11.}, month = {Nov}, pages = {1174-82}, edition = {2006/10/10}, abstract = {BACKGROUND AND OBJECTIVE: Measuring physical functioning (PF) within and across postacute settings is critical for monitoring outcomes of rehabilitation; however, most current instruments lack sufficient breadth and feasibility for widespread use. Computer adaptive testing (CAT), in which item selection is tailored to the individual patient, holds promise for reducing response burden, yet maintaining measurement precision. We calibrated a PF item bank via item response theory (IRT), administered items with a post hoc CAT design, and determined whether CAT would improve accuracy and precision of score estimates over random item selection. METHODS: 1,041 adults were interviewed during postacute care rehabilitation episodes in either hospital or community settings. Responses for 124 PF items were calibrated using IRT methods to create a PF item bank. We examined the accuracy and precision of CAT-based scores compared to a random selection of items. 
RESULTS: CAT-based scores had higher correlations with the IRT-criterion scores, especially with short tests, and resulted in narrower confidence intervals than scores based on a random selection of items; gains, as expected, were especially large for low and high performing adults. CONCLUSION: The CAT design may have important precision and efficiency advantages for point-of-care functional assessment in rehabilitation practice settings.}, keywords = {*Recovery of Function, Activities of Daily Living, Adolescent, Adult, Aged, Aged, 80 and over, Confidence Intervals, Factor Analysis, Statistical, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Rehabilitation/*standards, Reproducibility of Results, Software}, isbn = {0895-4356 (Print)}, author = {Haley, S. M. and Ni, P. and Hambleton, R. K. and Slavin, M. D. and Jette, A. M.} } @article {176, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: I. Activity outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {8}, year = {2006}, note = {Haley, Stephen MSiebens, HilaryCoster, Wendy JTao, WeiBlack-Schaffer, Randie MGandek, BarbaraSinclair, Samuel JNi, PengshengK0245354-01/phsR01 hd043568/hd/nichdResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Aug;87(8):1033-42.}, month = {Aug}, pages = {1033-42}, edition = {2006/08/01}, abstract = {OBJECTIVE: To examine score agreement, precision, validity, efficiency, and responsiveness of a computerized adaptive testing (CAT) version of the Activity Measure for Post-Acute Care (AM-PAC-CAT) in a prospective, 3-month follow-up sample of inpatient rehabilitation patients recently discharged home. DESIGN: Longitudinal, prospective 1-group cohort study of patients followed approximately 2 weeks after hospital discharge and then 3 months after the initial home visit. SETTING: Follow-up visits conducted in patients{\textquoteright} home setting. PARTICIPANTS: Ninety-four adults who were recently discharged from inpatient rehabilitation, with diagnoses of neurologic, orthopedic, and medically complex conditions. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from AM-PAC-CAT, including 3 activity domains of movement and physical, personal care and instrumental, and applied cognition were compared with scores from a traditional fixed-length version of the AM-PAC with 66 items (AM-PAC-66). RESULTS: AM-PAC-CAT scores were in good agreement (intraclass correlation coefficient model 3,1 range, .77-.86) with scores from the AM-PAC-66. On average, the CAT programs required 43\% of the time and 33\% of the items compared with the AM-PAC-66. Both formats discriminated across functional severity groups. The standardized response mean (SRM) was greater for the movement and physical fixed form than the CAT; the effect size and SRM of the 2 other AM-PAC domains showed similar sensitivity between CAT and fixed formats. Using patients{\textquoteright} own report as an anchor-based measure of change, the CAT and fixed length formats were comparable in responsiveness to patient-reported change over a 3-month interval. 
CONCLUSIONS: Accurate estimates for functional activity group-level changes can be obtained from CAT administrations, with a considerable reduction in administration time.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Siebens, H. and Coster, W. J. and Tao, W. and Black-Schaffer, R. M. and Gandek, B. and Sinclair, S. J. and Ni, P.} } @article {174, title = {Measurement precision and efficiency of multidimensional computer adaptive testing of physical functioning using the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {9}, year = {2006}, note = {Haley, Stephen MNi, PengshengLudlow, Larry HFragala-Pinkham, Maria AK02 hd45354-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, Non-U.S. Gov{\textquoteright}tUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Sep;87(9):1223-9.}, month = {Sep}, pages = {1223-9}, edition = {2006/08/29}, abstract = {OBJECTIVE: To compare the measurement efficiency and precision of a multidimensional computer adaptive testing (M-CAT) application to a unidimensional CAT (U-CAT) comparison using item bank data from 2 of the functional skills scales of the Pediatric Evaluation of Disability Inventory (PEDI). DESIGN: Using existing PEDI mobility and self-care item banks, we compared the stability of item calibrations and model fit between unidimensional and multidimensional Rasch models and compared the efficiency and precision of the U-CAT- and M-CAT-simulated assessments to a random draw of items. SETTING: Pediatric rehabilitation hospital and clinics. PARTICIPANTS: Clinical and normative samples. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Not applicable. RESULTS: The M-CAT had greater levels of precision and efficiency than the separate mobility and self-care U-CAT versions when using a similar number of items for each PEDI subdomain. Equivalent estimation of mobility and self-care scores can be achieved with a 25\% to 40\% item reduction with the M-CAT compared with the U-CAT. CONCLUSIONS: M-CAT applications appear to have both precision and efficiency advantages compared with separate U-CAT assessments when content subdomains have a high correlation. Practitioners may also realize interpretive advantages of reporting test score information for each subdomain when separate clinical inferences are desired.}, keywords = {*Disability Evaluation, *Pediatrics, Adolescent, Child, Child, Preschool, Computers, Disabled Persons/*classification/rehabilitation, Efficiency, Humans, Infant, Outcome Assessment (Health Care), Psychometrics, Self Care}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Ni, P. and Ludlow, L. H. and Fragala-Pinkham, M. 
A.} } @article {181, title = {Optimal and nonoptimal computer-based test designs for making pass-fail decisions}, journal = {Applied Measurement in Education}, volume = {19}, number = {3}, year = {2006}, pages = {221-239}, publisher = {Lawrence Erlbaum: US}, abstract = {Now that many credentialing exams are being routinely administered by computer, new computer-based test designs, along with item response theory models, are being aggressively researched to identify specific designs that can increase the decision consistency and accuracy of pass-fail decisions. The purpose of this study was to investigate the impact of optimal and nonoptimal multistage test (MST) designs, linear parallel-form test designs (LPFT), and computer adaptive test (CAT) designs on the decision consistency and accuracy of pass-fail decisions. Realistic testing situations matching those of one of the large credentialing agencies were simulated to increase the generalizability of the findings. The conclusions were clear: (a) With the LPFTs, matching test information functions (TIFs) to the mean of the proficiency distribution produced slightly better results than matching them to the passing score; (b) all of the test designs worked better than test construction using random selection of items, subject to content constraints only; (c) CAT performed better than the other test designs; and (d) if matching a TIP to the passing score, the MST design produced a bit better results than the LPFT design. If an argument for the MST design is to be made, it can be made on the basis of slight improvements over the LPFT design and better expected item bank utilization, candidate preference, and the potential for improved diagnostic feedback, compared with the feedback that is possible with fixed linear test forms. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {adaptive test, credentialing exams, Decision Making, Educational Measurement, multistage tests, optimal computer-based test designs, test form}, isbn = {0895-7347 (Print); 1532-4818 (Electronic)}, author = {Hambleton, R. K. and Xing, D.} } @article {175, title = {Assessing mobility in children using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {86}, number = {5}, year = {2005}, note = {Haley, Stephen MRaczek, Anastasia ECoster, Wendy JDumas, Helene MFragala-Pinkham, Maria AK02 hd45354-01a1/hd/nichdR43 hd42388-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, P.H.S.United StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2005 May;86(5):932-9.}, month = {May}, pages = {932-9}, edition = {2005/05/17}, abstract = {OBJECTIVE: To assess score agreement, validity, precision, and response burden of a prototype computerized adaptive testing (CAT) version of the Mobility Functional Skills Scale (Mob-CAT) of the Pediatric Evaluation of Disability Inventory (PEDI) as compared with the full 59-item version (Mob-59). DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; and cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics, community-based day care, preschool, and children{\textquoteright}s homes. 
PARTICIPANTS: Four hundred sixty-nine children with disabilities and 412 children with no disabilities (analytic sample); 41 children without disabilities and 39 with disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from a prototype Mob-CAT application and versions using 15-, 10-, and 5-item stopping rules; scores from the Mob-59; and number of items and time (in seconds) to administer assessments. RESULTS: Mob-CAT scores from both computer simulations (intraclass correlation coefficient [ICC] range, .94-.99) and field administrations (ICC=.98) were in high agreement with scores from the Mob-59. Using computer simulations of retrospective data, discriminant validity, and sensitivity to change of the Mob-CAT closely approximated that of the Mob-59, especially when using the 15- and 10-item stopping rule versions of the Mob-CAT. The Mob-CAT used no more than 15\% of the items for any single administration, and required 20\% of the time needed to administer the Mob-59. CONCLUSIONS: Comparable score estimates for the PEDI mobility scale can be obtained from CAT administrations, with losses in validity and precision for shorter forms, but with a considerable reduction in administration time.}, keywords = {*Computer Simulation, *Disability Evaluation, Adolescent, Child, Child, Preschool, Cross-Sectional Studies, Disabled Children/*rehabilitation, Female, Humans, Infant, Male, Outcome Assessment (Health Care)/*methods, Rehabilitation Centers, Rehabilitation/*standards, Sensitivity and Specificity}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Raczek, A. E. and Coster, W. J. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {171, title = {A computer adaptive testing approach for assessing physical functioning in children and adolescents}, journal = {Developmental Medicine and Child Neuropsychology}, volume = {47}, number = {2}, year = {2005}, note = {Haley, Stephen MNi, PengshengFragala-Pinkham, Maria ASkrinar, Alison MCorzo, DeyaniraComparative StudyResearch Support, Non-U.S. Gov{\textquoteright}tEnglandDevelopmental medicine and child neurologyDev Med Child Neurol. 2005 Feb;47(2):113-20.}, month = {Feb}, pages = {113-120}, edition = {2005/02/15}, abstract = {The purpose of this article is to demonstrate: (1) the accuracy and (2) the reduction in amount of time and effort in assessing physical functioning (self-care and mobility domains) of children and adolescents using computer-adaptive testing (CAT). A CAT algorithm selects questions directly tailored to the child{\textquoteright}s ability level, based on previous responses. Using a CAT algorithm, a simulation study was used to determine the number of items necessary to approximate the score of a full-length assessment. We built simulated CAT (5-, 10-, 15-, and 20-item versions) for self-care and mobility domains and tested their accuracy in a normative sample (n=373; 190 males, 183 females; mean age 6y 11mo [SD 4y 2m], range 4mo to 14y 11mo) and a sample of children and adolescents with Pompe disease (n=26; 21 males, 5 females; mean age 6y 1mo [SD 3y 10mo], range 5mo to 14y 10mo). Results indicated that comparable score estimates (based on computer simulations) to the full-length tests can be achieved in a 20-item CAT version for all age ranges and for normative and clinical samples. No more than 13 to 16\% of the items in the full-length tests were needed for any one administration. 
These results support further consideration of using CAT programs for accurate and efficient clinical assessments of physical functioning.}, keywords = {*Computer Systems, Activities of Daily Living, Adolescent, Age Factors, Child, Child Development/*physiology, Child, Preschool, Computer Simulation, Confidence Intervals, Demography, Female, Glycogen Storage Disease Type II/physiopathology, Health Status Indicators, Humans, Infant, Infant, Newborn, Male, Motor Activity/*physiology, Outcome Assessment (Health Care)/*methods, Reproducibility of Results, Self Care, Sensitivity and Specificity}, isbn = {0012-1622 (Print)}, author = {Haley, S. M. and Ni, P. and Fragala-Pinkham, M. A. and Skrinar, A. M. and Corzo, D.} } @article {211, title = {Contemporary measurement techniques for rehabilitation outcomes assessment}, journal = {Journal of Rehabilitation Medicine}, volume = {37}, number = {6}, year = {2005}, note = {1650-1977 (Print)Journal ArticleReview}, pages = {339-345}, abstract = {In this article, we review the limitations of traditional rehabilitation functional outcome instruments currently in use within the rehabilitation field to assess Activity and Participation domains as defined by the International Classification of Functioning, Disability and Health. These include a narrow scope of functional outcomes, data incompatibility across instruments, and the precision versus feasibility dilemma. Following this, we illustrate how contemporary measurement techniques, such as item response theory methods combined with computer adaptive testing methodology, can be applied in rehabilitation to design functional outcome instruments that are comprehensive in scope, accurate, allow for compatibility across instruments, and are sensitive to clinically important change without sacrificing their feasibility. Finally, we present some of the pressing challenges that need to be overcome to provide effective dissemination and training assistance to ensure that current and future generations of rehabilitation professionals are familiar with and skilled in the application of contemporary outcomes measurement.}, keywords = {*Disability Evaluation, Activities of Daily Living/classification, Disabled Persons/classification/*rehabilitation, Health Status Indicators, Humans, Outcome Assessment (Health Care)/*methods/standards, Recovery of Function, Research Support, N.I.H., Extramural, Research Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Sensitivity and Specificity, computerized adaptive testing}, author = {Jette, A. M. and Haley, S. M.} } @article {168, title = {Activity outcome measurement for postacute care}, journal = {Medical Care}, volume = {42}, number = {1 Suppl}, year = {2004}, note = {0025-7079Journal ArticleMulticenter Study}, pages = {I49-I61}, abstract = {BACKGROUND: Efforts to evaluate the effectiveness of a broad range of postacute care services have been hindered by the lack of conceptually sound and comprehensive measures of outcomes. It is critical to determine a common underlying structure before employing current methods of item equating across outcome instruments for future item banking and computer-adaptive testing applications. OBJECTIVE: To investigate the factor structure, reliability, and scale properties of items underlying the Activity domains of the International Classification of Functioning, Disability and Health (ICF) for use in postacute care outcome measurement. 
METHODS: We developed a 41-item Activity Measure for Postacute Care (AM-PAC) that assessed an individual{\textquoteright}s execution of discrete daily tasks in his or her own environment across major content domains as defined by the ICF. We evaluated the reliability and discriminant validity of the prototype AM-PAC in 477 individuals in active rehabilitation programs across 4 rehabilitation settings using factor analyses, tests of item scaling, internal consistency reliability analyses, Rasch item response theory modeling, residual component analysis, and modified parallel analysis. RESULTS: Results from an initial exploratory factor analysis produced 3 distinct, interpretable factors that accounted for 72\% of the variance: Applied Cognition (44\%), Personal Care \& Instrumental Activities (19\%), and Physical \& Movement Activities (9\%); these 3 activity factors were verified by a confirmatory factor analysis. Scaling assumptions were met for each factor in the total sample and across diagnostic groups. Internal consistency reliability was high for the total sample (Cronbach alpha = 0.92 to 0.94), and for specific diagnostic groups (Cronbach alpha = 0.90 to 0.95). Rasch scaling, residual factor, differential item functioning, and modified parallel analyses supported the unidimensionality and goodness of fit of each unique activity domain. CONCLUSIONS: This 3-factor model of the AM-PAC can form the conceptual basis for common-item equating and computer-adaptive applications, leading to a comprehensive system of outcome instruments for postacute care settings.}, keywords = {*Self Efficacy, *Sickness Impact Profile, Activities of Daily Living/*classification/psychology, Adult, Aftercare/*standards/statistics \& numerical data, Aged, Boston, Cognition/physiology, Disability Evaluation, Factor Analysis, Statistical, Female, Human, Male, Middle Aged, Movement/physiology, Outcome Assessment (Health Care)/*methods/statistics \& numerical data, Psychometrics, Questionnaires/standards, Rehabilitation/*standards/statistics \& numerical data, Reproducibility of Results, Sensitivity and Specificity, Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Support, U.S. Gov{\textquoteright}t, P.H.S.}, author = {Haley, S. M. and Coster, W. J. and Andres, P. L. and Ludlow, L. H. and Ni, P. and Bond, T. L. and Sinclair, S. J. and Jette, A. M.} } @article {87, title = {Refining the conceptual basis for rehabilitation outcome measurement: personal care and instrumental activities domain}, journal = {Medical Care}, volume = {42}, number = {1 Suppl}, year = {2004}, note = {0025-7079Journal Article}, month = {Jan}, pages = {I62-I72}, abstract = {BACKGROUND: Rehabilitation outcome measures routinely include content on performance of daily activities; however, the conceptual basis for item selection is rarely specified. These instruments differ significantly in format, number, and specificity of daily activity items and in the measurement dimensions and type of scale used to specify levels of performance. We propose that a requirement for upper limb and hand skills underlies many activities of daily living (ADL) and instrumental activities of daily living (IADL) items in current instruments, and that items selected based on this definition can be placed along a single functional continuum. 
OBJECTIVE: To examine the dimensional structure and content coverage of a Personal Care and Instrumental Activities item set and to examine the comparability of items from existing instruments and a set of new items as measures of this domain. METHODS: Participants (N = 477) from 3 different disability groups and 4 settings representing the continuum of postacute rehabilitation care were administered the newly developed Activity Measure for Post-Acute Care (AM-PAC), the SF-8, and an additional setting-specific measure: FIM (in-patient rehabilitation); MDS (skilled nursing facility); MDS-PAC (postacute settings); OASIS (home care); or PF-10 (outpatient clinic). Rasch (partial-credit model) analyses were conducted on a set of 62 items covering the Personal Care and Instrumental domain to examine item fit, item functioning, and category difficulty estimates and unidimensionality. RESULTS: After removing 6 misfitting items, the remaining 56 items fit acceptably along the hypothesized continuum. Analyses yielded different difficulty estimates for the maximum score (eg, "Independent performance") for items with comparable content from different instruments. Items showed little differential item functioning across age, diagnosis, or severity groups, and 92\% of the participants fit the model. CONCLUSIONS: ADL and IADL items from existing rehabilitation outcomes instruments that depend on skilled upper limb and hand use can be located along a single continuum, along with the new personal care and instrumental items of the AM-PAC addressing gaps in content. Results support the validity of the proposed definition of the Personal Care and Instrumental Activities dimension of function as a guide for future development of rehabilitation outcome instruments, such as linked, setting-specific short forms and computerized adaptive testing approaches.}, keywords = {*Self Efficacy, *Sickness Impact Profile, Activities of Daily Living/*classification/psychology, Adult, Aged, Aged, 80 and over, Disability Evaluation, Factor Analysis, Statistical, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods/statistics \& numerical data, Questionnaires/*standards, Recovery of Function/physiology, Rehabilitation/*standards/statistics \& numerical data, Reproducibility of Results, Research Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Research Support, U.S. Gov{\textquoteright}t, P.H.S., Sensitivity and Specificity}, author = {Coster, W. J. and Haley, S. M. and Andres, P. L. and Ludlow, L. H. and Bond, T. L. and Ni, P. S.} } @article {167, title = {Score comparability of short forms and computerized adaptive testing: Simulation study with the activity measure for post-acute care}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {85}, number = {4}, year = {2004}, note = {Haley, Stephen MCoster, Wendy JAndres, Patricia LKosinski, MarkNi, PengshengR01 hd43568/hd/nichdComparative StudyMulticenter StudyResearch Support, U.S. Gov{\textquoteright}t, Non-P.H.S.Research Support, U.S. Gov{\textquoteright}t, P.H.S.United StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2004 Apr;85(4):661-6.}, month = {Apr}, pages = {661-6}, edition = {2004/04/15}, abstract = {OBJECTIVE: To compare simulated short-form and computerized adaptive testing (CAT) scores to scores obtained from complete item sets for each of the 3 domains of the Activity Measure for Post-Acute Care (AM-PAC). DESIGN: Prospective study. 
SETTING: Six postacute health care networks in the greater Boston metropolitan area, including inpatient acute rehabilitation, transitional care units, home care, and outpatient services. PARTICIPANTS: A convenience sample of 485 adult volunteers who were receiving skilled rehabilitation services. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Inpatient and community-based short forms and CAT applications were developed for each of 3 activity domains (physical \& movement, personal care \& instrumental, applied cognition) using item pools constructed from new items and items from existing postacute care instruments. RESULTS: Simulated CAT scores correlated highly with score estimates from the total item pool in each domain (4- and 6-item CAT r range, .90-.95; 10-item CAT r range, .96-.98). Scores on the 10-item short forms constructed for inpatient and community settings also provided good estimates of the AM-PAC item pool scores for the physical \& movement and personal care \& instrumental domains, but were less consistent in the applied cognition domain. Confidence intervals around individual scores were greater in the short forms than for the CATs. CONCLUSIONS: Accurate scoring estimates for AM-PAC domains can be obtained with either the setting-specific short forms or the CATs. The strong relationship between CAT and item pool scores can be attributed to the CAT{\textquoteright}s ability to select specific items to match individual responses. The CAT may have additional advantages over short forms in practicality, efficiency, and the potential for providing more precise scoring estimates for individuals.}, keywords = {Boston, Factor Analysis, Statistical, Humans, Outcome Assessment (Health Care)/*methods, Prospective Studies, Questionnaires/standards, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Coster, W. J. and Andres, P. L. and Kosinski, M. and Ni, P.} } @article {191, title = {Item response theory and health outcomes measurement in the 21st century}, journal = {Medical Care}, volume = {38}, number = {9 Suppl II}, year = {2000}, note = {204349670025-7079Journal Article}, pages = {II28-II42}, abstract = {Item response theory (IRT) has a number of potential advantages over classical test theory in assessing self-reported health outcomes. IRT models yield invariant item and latent trait estimates (within a linear transformation), standard errors conditional on trait level, and trait estimates anchored to item content. IRT also facilitates evaluation of differential item functioning, inclusion of items with different response formats in the same scale, and assessment of person fit and is ideally suited for implementing computer adaptive testing. Finally, IRT methods can be helpful in developing better health outcome measures and in assessing change over time. These issues are reviewed, along with a discussion of some of the methodological and practical challenges in applying IRT methods.}, keywords = {*Models, Statistical, Activities of Daily Living, Data Interpretation, Statistical, Health Services Research/*methods, Health Surveys, Human, Mathematical Computing, Outcome Assessment (Health Care)/*methods, Research Design, Support, Non-U.S. Gov{\textquoteright}t, Support, U.S. Gov{\textquoteright}t, P.H.S., United States}, author = {Hays, R. D. and Morales, L. S. and Reise, S. 
P.} } @article {28, title = {Competency gradient for child-parent centers}, journal = {Journal of Outcomes Measurement}, volume = {3}, number = {1}, year = {1999}, note = {1090-655X (Print)Journal ArticleResearch Support, U.S. Gov{\textquoteright}t, P.H.S.}, pages = {35-52}, abstract = {This report describes an implementation of the Rasch model during the longitudinal evaluation of a federally-funded early childhood preschool intervention program. An item bank is described for operationally defining a psychosocial construct called community life-skills competency, an expected teenage outcome of the preschool intervention. This analysis examined the position of teenage students on this scale structure, and investigated a pattern of cognitive operations necessary for students to pass community life-skills test items. Then this scale structure was correlated with nationally standardized reading and math achievement scores, teacher ratings, and school records to assess its validity as a measure of the community-related outcome goal for this intervention. The results show a functional relationship between years of early intervention and magnitude of effect on the life-skills competency variable.}, keywords = {*Models, Statistical, Activities of Daily Living/classification/psychology, Adolescent, Chicago, Child, Child, Preschool, Early Intervention (Education)/*statistics \& numerical data, Female, Follow-Up Studies, Humans, Male, Outcome and Process Assessment (Health Care)/*statistics \& numerical data}, author = {Bezruczko, N.} } @article {419, title = {The use of Rasch analysis to produce scale-free measurement of functional ability}, journal = {American Journal of Occupational Therapy}, volume = {53}, number = {1}, year = {1999}, note = {991250470272-9490Journal Article}, pages = {83-90}, abstract = {Innovative applications of Rasch analysis can lead to solutions for traditional measurement problems and can produce new assessment applications in occupational therapy and health care practice. First, Rasch analysis is a mechanism that translates scores across similar functional ability assessments, thus enabling the comparison of functional ability outcomes measured by different instruments. This will allow for the meaningful tracking of functional ability outcomes across the continuum of care. Second, once the item-difficulty order of an instrument or item bank is established by Rasch analysis, computerized adaptive testing can be used to target items to the patient{\textquoteright}s ability level, reducing assessment length by as much as one half. More importantly, Rasch analysis can provide the foundation for "equiprecise" measurement or the potential to have precise measurement across all levels of functional ability. The use of Rasch analysis to create scale-free measurement of functional ability demonstrates how this methodology can be used in practical applications of clinical and outcome assessment.}, keywords = {*Activities of Daily Living, Disabled Persons/*classification, Human, Occupational Therapy/*methods, Predictive Value of Tests, Questionnaires/standards, Sensitivity and Specificity}, author = {Velozo, C. A. and Kielhofner, G. and Lai, J-S.} }