Uwe Siebert

Real World Health Care Data Analysis


Скачать книгу

      Macro: MP_ASSIGN

      Purpose: Find and create pooled missing value patterns

      ******************************************************************************;

      * Input parameters:

      * MSDATA = input data set

      * outdata = output data set

      * varlist = a list of variables to be included in the propensity score

      * estimation. Note that all variables must be of the same type.

      * N_MP_MIN = minimum number of observations for each missing pattern.

      * Missing patterns with fewer than N_MP_MIN observations will be pooled.

      ******************************************************************************;

      %MACRO MP_ASSIGN(MSDATA = , OUTDATA =, VARLIST =, N_MP_MIN = 100);

      /* Determine how many variables to include in the propensity score estimation.
         VARLIST is walked one blank-delimited word at a time. Macro variables
         set here: N (scan position), VAR (current word), VARINT (accumulated
         list of <word>*MP terms), KO and M_MISSING (number of words scanned).
         NOTE(review): MP is not defined in this part of the macro; presumably
         it is the pooled missing-pattern variable created later - confirm. */

      %LET N = 1;

      %LET VARINT = ;

      /* %DO %UNTIL evaluates its condition after each pass, so the body always
         runs at least once. N has already been advanced when the condition is
         checked, so the loop stops as soon as the next word comes back blank.
         NOTE(review): with an empty VARLIST the single pass still happens,
         giving M_MISSING = 1 and VARINT = *MP - confirm callers never pass an
         empty list. */
      %DO %UNTIL(%QSCAN(&VARLIST., &N. , %STR( )) EQ %STR( ));

      %LET VAR = %QSCAN(&VARLIST. , &N. , %STR( ));

      /* Append the current variable crossed with MP to the running list. */
      %LET VARINT = &VARINT &VAR.*MP;

      %LET N = %EVAL(&N. + 1);

      %END;

      /* N now sits one past the last word scanned, hence N-1.
         KO and M_MISSING receive the same value. */
      %LET KO = %EVAL(&N-1);

      %LET M_MISSING = %EVAL(&N-1);

      /* Echo the derived values to the SAS log. */
      %PUT &VARINT;

      %PUT &KO;

      %PUT &M_MISSING;

      /* Create indicators for missing values and missingness patterns */

      DATA MS;

      SET &MSDATA;

      ARRAY MS{&M_MISSING} M1-M&M_MISSING.;

      ARRAY X{&M_MISSING} &VARLIST;

      MV = 0;

      DO I = 1 TO &M_MISSING;

      IF X{I} = . THEN MS{I} = 1;

      ELSE MS{I} = 0;

      MV = 2*MV + MS{I};

      END;

      MV = MV + 1;

      DROP I;

      RUN;

      /* Only keep one record for each missingness pattern */

      PROC SORT DATA = MS OUT = PATTERN NODUPKEY;

      BY MV;

      RUN;

      /* Calculate the number of observations in each missingness pattern */

      PROC FREQ DATA = MS NOPRINT;

      TABLES MV / OUT = M_MP(KEEP = MV COUNT);

      RUN;

      DATA PATTERN;

      MERGE PATTERN M_MP;

      BY MV;

      RUN;

      PROC SORT DATA = PATTERN;

      BY DESCENDING COUNT;

      RUN;

      /* Assign missingness pattern to new index from the largest to the smallest */

      DATA PATTERN;

      RETAIN M1-M&M_MISSING MV COUNT MV_S;

      SET PATTERN;

      KEEP M1-M&M_MISSING MV COUNT MV_S;

      MV_S = _N_;

      RUN;

      PROC IML;

      USE PATTERN;

      READ ALL INTO A;

      CLOSE PATTERN;

      MS = A[, 1:&M_MISSING];

      MV = A[, 1+&M_MISSING];

      N_MP = A[, 2+&M_MISSING];

      MV_S = A[, 3+&M_MISSING];

      M_MP = NROW(MS);

      M = NCOL(MS);

      /* Calculate the distance between missingness patterns */

      DISTANCE = J(M_MP, M_MP, 0);

      DO I = 1 TO M_MP;

      DO J = 1 TO I-1;

      D = 0;

      DO L = 1 TO M;

      D = D + ( (MS[I,L]-MS[J,L])*(MS[I,L]-MS[J,L]) );

      END;

      DISTANCE[I,J] = D;

      DISTANCE[J,I] = D;

      END;

      END;

      I = 0;

      K_MV_POOL = 0;

      MV_POOL = J(M_MP, 1, 0);

      /*Pooling small missingness patterns according to their similarities to reach a prespecified minimum number of observations (&N_MP_MIN) in each pattern */

      DO WHILE( I < M_MP);

      I = I + 1;

      IF MV_POOL[I] = 0 THEN

      DO;

      K_MV_POOL = K_MV_POOL + 1;

      N_MP_POOL = N_MP[I];

      IF N_MP_POOL >= &N_MP_MIN THEN

      DO;

      MV_POOL[I] = K_MV_POOL;

      END;

      ELSE

      DO;

      IF I < M_MP THEN

      DO;

      A = DISTANCE[(I+1):M_MP, I];

      B = MV[(I+1):M_MP];

      C = N_MP[(I+1):M_MP];

      D = MV_S[(I+1):M_MP];

      E = MV_POOL[(I+1):M_MP];

      TT = A || B || C || D || E;

      CALL SORT( TT, {1 3});

      J = 0;

      DO WHILE( (N_MP_POOL < &N_MP_MIN) & (I+J < M_MP) );

      J = J+1;

      IF (TT[J,5] = 0) THEN

      DO;

      N_MP_POOL = N_MP_POOL + TT[J,3];

      TT[J,5] = K_MV_POOL;

      END;

      END;

      END;

      IF ( N_MP_POOL >= &N_MP_MIN ) THEN

      DO;

      MV_POOL[I] = K_MV_POOL;

      DO K = 1 TO J;

      MV_POOL[TT[K,4]] = K_MV_POOL;

      END;

      END;

      ELSE

      DO J = I TO M_MP;

      SGN_TMP = 0;

      K = 1;

      DO WHILE(SGN_TMP = 0 & K <= M_MP);

      DO L = 1 TO M_MP;

      IF (DISTANCE[J,L] = K) & (MV_POOL[J]=0) &

      (MV_POOL[L]>0) THEN

      DO;

      MV_POOL[J] = MV_POOL[L];

      SGN_TMP = 1;

      END;

      END;