Same frequency distribution

by Irina 7. July 2007 10:55

A frequency distribution is a summary of the data set in which the interval of possible values is divided into subintervals, known as classes. For each class, the number of data values in that class is recorded; this is the frequency of the class. The relative frequency of the class is the frequency of the class divided by the number of values in the data set.

Sometimes we want to compare two populations, and to do so we want to choose observations from the second population whose frequency distribution is similar to that of the first population.

Example: 

/* Split loans.Ekspr_model_result_200704 into the two populations in a single
   pass: rows with zevet=7 go to a7, rows with zevet=3 go to a3.
   Both outputs carry the derived key code = km_L_PASIV_month_b*10 + kmin_age. */
data a7 a3;
  set loans.Ekspr_model_result_200704;
  code = 10 * km_L_PASIV_month_b + kmin_age;
  select (zevet);
    when (7) output a7;
    when (3) output a3;
    otherwise;   /* rows of any other zevet value are not kept */
  end;
run;

We want to draw a sample from a3 whose frequency distribution by pasiv and age (the code variable) is similar to that of a7.
/* One-way frequency table of code in a7, written to outkod
   (PROC FREQ output columns: code, count, percent). Nothing is printed. */
proc freq data=a7 noprint;
  tables code / out=outkod;
run;

/* Same table for a3, written to outkod3. count and percent are renamed to
   count3 and percent3 so both tables can be joined without name clashes. */
proc freq data=a3 noprint;
  tables code / out=outkod3(rename=(count=count3 percent=percent3));
run;


/* Join the a7 and a3 frequency tables on code and compute, per code:
     counter       - total number of a3 rows over all matched codes
     new_am        - rows that code would get if all of a3 followed a7's percent
     ratio         - a7 percent divided by a3 percent
     ratio_lacking - available a3 rows (count3) relative to new_am
     min_ratio     - smallest ratio_lacking over all codes (same on every row)
   sum() and min() are used without GROUP BY on purpose: PROC SQL remerges the
   summary value back onto every row.
   Fixes versus the original: the misspelled SELECT keyword and the stray comma
   before FROM, both of which stopped the step from compiling. The inner and
   outer aliases are now distinct, and the result is ordered by code so the row
   order is deterministic.
   Note: when new_am is 0 the division yields a missing ratio_lacking, which
   min() ignores. */
proc sql;
  create table all_res as
  select t.*,
         t.count3 / t.new_am      as ratio_lacking,
         min(t.count3 / t.new_am) as min_ratio
  from (
         select ref.*,
                src.percent3,
                src.count3,
                sum(src.count3)                          as counter,
                int(sum(src.count3) * ref.percent / 100) as new_am,
                ref.percent / src.percent3               as ratio
         from outkod  as ref,
              outkod3 as src
         where ref.code = src.code
       ) as t
  order by t.code;
quit;


/* Derive the total sample size &nn from the code(s) where a3 is scarcest
   (ratio_lacking equals the overall minimum): the total at which that code is
   used up completely, count3 * 100 / percent.
   - If several codes tie for the minimum, the smallest candidate total is
     kept. The original silently kept whichever tied row came last.
   - CALL SYMPUTX stores the number without leading blanks and without the
     implicit numeric-to-character conversion note that CALL SYMPUT produces. */
data _null_;
  set all_res end=last_row;
  retain best_pop;
  if min_ratio = ratio_lacking then do;
    num_pop  = round(count3 * 100 / percent);
    best_pop = min(best_pop, num_pop);   /* min() ignores the initial missing value */
  end;
  if last_row then call symputx('nn', best_pop);
run;

%put &nn;

/* Turn the total sample size &nn into a per-code sampling rate:
     new_new         - rows wanted from this code (a7 percent of &nn)
     ratio_new_new   - wanted rows as a fraction of the a3 rows available
     round_ratio_new - that fraction rounded to 4 decimals, capped at 1
   The cap matters: because new_am is truncated with int() upstream, a code
   can end up wanting more rows than a3 holds. PROC SURVEYSELECT reads a
   sampling rate above 1 as a percentage (1.5 means 1.5%), so an uncapped
   value would silently under-sample that code instead of taking all of it. */
data all_res1;
  set all_res;
  new_new         = round(&nn * percent / 100);
  ratio_new_new   = new_new / count3;
  round_ratio_new = min(round(ratio_new_new, 0.0001), 1);
run;

/* Draw a stratified simple random sample from a3, one sampling rate per code.
   PROC SURVEYSELECT matches the RATE=(...) list to the strata by position, in
   the sorted order of the STRATA variable, and requires its input to be sorted
   by that variable. Therefore:
     - the rate list is built in code order,
     - a3 is restricted to the codes that have a rate (codes present in a3 but
       absent from all_res1 would otherwise leave the list one value short or
       shift rates onto the wrong stratum),
     - the restricted copy is sorted by code before sampling. */
proc sql noprint;
  select round_ratio_new
    into :ratio separated by ' '
    from all_res1
    order by code;

  create table a3_sorted as
    select *
    from a3
    where code in (select code from all_res1)
    order by code;
quit;

proc surveyselect data=a3_sorted method=srs
                  rate=(&ratio) out=sample;
  strata code;
run;

/* Visual check: print the distribution of code in the reference
   population a7 ... */
proc freq data = a7;
  tables code;
run;

/* ... and in the drawn sample, so the two percent columns can be compared. */
proc freq data = sample;
  tables code;
run;

The way of creating the same distribution

by Irina 14. April 2007 06:41

Sometimes we want to create a variable with the same distribution as another variable.

We can do it this way: the ttt thresholds below are the cumulative distribution (in percent) of the other variable, which we can obtain from proc freq.

/* Assign kod (a yyyymm code) to every row of SAME_DISTR.
   TARGET=0 rows get a random kod: a uniform draw ttt on (0,100) is compared
   with hard-coded cumulative-percent thresholds and the first threshold it
   does not exceed decides the month.
   TARGET=1 rows get kod from their own Loan_Value_Date. */
data SAME_DISTR;
  set SAME_DISTR;
  if TARGET = 0 then do;
    ttt = 100 * ranuni(31311115);
    select;
      when (ttt <= 5)   kod = 200502;
      when (ttt <= 28)  kod = 200503;
      when (ttt <= 40)  kod = 200504;
      when (ttt <= 52)  kod = 200505;
      when (ttt <= 62)  kod = 200506;
      when (ttt <= 71)  kod = 200507;
      when (ttt <= 80)  kod = 200508;
      when (ttt <= 88)  kod = 200509;
      when (ttt <= 93)  kod = 200510;
      when (ttt <= 100) kod = 200511;
      otherwise;   /* no threshold matched: kod is left as is */
    end;
  end;
  if TARGET = 1 then kod = month(Loan_Value_Date) + 100 * year(Loan_Value_Date);
run;

This is a smarter way to do the same thing:


/* Cumulative distribution of Loan_Value_Date in the reference data.
   OUTCUM adds the cumulative columns (including cum_pct) to outkod. */
proc freq data=required_destribution;
  tables Loan_Value_Date / out=outkod outcum noprint;
run;

/* Flatten outkod into three macro variables:
     &a1 - comma-separated cumulative percents
     &a2 - comma-separated Loan_Value_Date values, in the same order
     &nn - number of distinct values (rows of outkod)
   Changes versus the original:
     - the buffers are 32767 characters instead of 5000, so long value lists
       are no longer silently truncated (truncation would leave &a1/&a2 with
       fewer items than &nn),
     - CATX builds the lists without the leading comma that had to be cut off
       with SUBSTR, and without implicit numeric-to-character conversion notes,
     - CALL SYMPUTX stores the values without leading or trailing blanks. */
data _null_;
  length kod_str pct_str $32767;
  retain kod_str pct_str;
  set outkod end=eof;
  kod_str = catx(',', kod_str, Loan_Value_Date);
  pct_str = catx(',', pct_str, cum_pct);
  if eof then do;
    call symputx('a1', pct_str);
    call symputx('a2', kod_str);
    call symputx('nn', _n_);
  end;
run;

/* Give TARGET=0 rows a random Loan_Value_Date that follows the reference
   distribution, then derive kod (yyyymm) for every row.
   a1 holds the cumulative percents from &a1 and a2 the matching dates from
   &a2, both with &nn elements. A uniform draw ttt on (0,100) selects the
   element i whose interval (a1[i-1], a1[i]] contains it; for i=1 there is no
   lower bound. */
data SAME_DISTR;
  set SAME_DISTR;
  array a1 {&nn} _temporary_ (&a1);
  array a2 {&nn} _temporary_ (&a2);
  if TARGET = 0 then do;
    ttt = 100 * ranuni(31311115);
    do i = 1 to dim(a1);
      if ttt <= a1[i] then do;
        /* upper bound satisfied; accept only if ttt is above the previous bound */
        if i = 1 then Loan_Value_Date = a2[i];
        else if a1[i-1] < ttt then Loan_Value_Date = a2[i];
      end;
    end;
  end;

  kod = month(Loan_Value_Date) + 100 * year(Loan_Value_Date);
run;

About the author

Irina Spivak Irina Spivak
Team Leader at G-Stat. More...


Send mail Email

Authors

Blogroll

    Disclaimer

    The opinions expressed herein are my own personal opinions and do not represent my employer's view in anyway.

    © Copyright 2010

    Sign in

    eXTReMe Tracker