#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #+ #+ Mumps Information Storage and Retrieval Software Library #+ Copyright (C) 2005 by Kevin C. O'Kane #+ #+ Kevin C. O'Kane #+ okane@cs.uni.edu #+ #+ #+ This program is free software; you can redistribute it and/or modify #+ it under the terms of the GNU General Public License as published by #+ the Free Software Foundation; either version 2 of the License, or #+ (at your option) any later version. #+ #+ This program is distributed in the hope that it will be useful, #+ but WITHOUT ANY WARRANTY; without even the implied warranty of #+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #+ GNU General Public License for more details. #+ #+ You should have received a copy of the GNU General Public License #+ along with this program; if not, write to the Free Software #+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #+ #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # prox # ttx term-term correlation matrix # calculate term-term proximity coefficients within env words kill ^tt //* delete any old term-term correlation matrix write !!,"Term-Term Correlation [ttx.mps] ",$zd,! #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # for each document k, sum the co-occurrences of words i and j #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ set Wgt=0 set Wgtx=0 Open 1:"ttx.tmp,new" # for each document k set k="" for do . set k=$order(^p(k)) . if k="" break # for each term i in p k . set i="" . for do .. set i=$order(^p(k,i)) .. if i="" break # for each other term j in doc k .. set j=i .. for do ... set j=$order(^p(k,j)) ... if j="" break # for each position m of term i in doc k ... set m="" ... for do .... set m=$order(^p(k,i,m)) .... if m="" break # for each position n of term j in doc k .... set n="" .... for do ..... set n=$order(^p(k,j,n)) ..... if n="" break # calculate and store weight based on proximity ..... set dd=$zlog(1/$zabs(m-n)*20+1)\1 ..... if dd<1 quit ..... if '$Data(^tt(i,j)) do ...... set ^tt(i,j)=dd ...... set ^tt(i,j,1)=n-m //* which word is first (+/-) ...... set Wgtx=Wgtx+1 ...... set Wgt=Wgt+dd ..... else do ...... set ^tt(i,j)=^tt(i,j)+dd ...... set ^tt(i,j,1)=^tt(i,j,1)+(n-m) //* avg which word is first (+/-) ...... set Wgt=Wgt+dd do graph # normalize set max=0 set i="" for do . set i=$order(^tt(i)) . if i="" break . set j=i . for do .. set j=$order(^tt(i,j)) .. if j="" break .. if ^tt(i,j)>max set max=^tt(i,j) # set max=WgtFactor/100*dm\1 set max=max*.1\1 # build other diagonal matrix set i="" for do . set i=$order(^tt(i)) . if i="" break . set j=i . for do .. set j=$order(^tt(i,j)) .. if j="" break .. if ^tt(i,j)65 !," " ... write " ",j,"[",^tt(i,j),"]" ... if j]i do .... Use 1 .... write $Justify(^tt(i,j),6)," ",i," ",j,! .... Use 5 write !! Close 1 set i=$zsystem("sort -n -r ttx.rank") //* shell command set i=$zsystem("rm ttx.tmp") //* shell command write "Dump data: ",$zcd("ttx.dmp"),! write $zd,! halt graph write !,"Graphs",! kill ^hx kill ^hxx kill ^hr set i="" set k=0 for do . set i=$order(^tt(i)) . if i="" break . set j="" . for do .. set j=$order(^tt(i,j)) .. if j="" break .. set k=k+1 .. set ^hr(k)=^tt(i,j) set i=0 set dm=0 for do . set i=$order(^hr(i)) . if i="" break . if ^hr(i)>dm set dm=^hr(i) for j=1:1:dm set ^hx(j)=0 set i=0 for do . set i=$order(^hr(i)) . if i="" break . set j=^hr(i) . set ^hx(j)=^hx(j)+1 set hxmax=0 set j=$order(^hx("")) for i=j:1:dm set ^hxx(i)=^hx(i) set ^hx(i)=$zlog(^hx(i)+1) if ^hx(i)>hxmax set hxmax=^hx(i) write !,"Logarithmic Histogram Showing Number of Term Pairs for levels of Correlation Strength",!! write " Corr Words",!! set j=$order(^hx("")) for i=j:1:dm do . set k=^hx(i)/hxmax*100\1 . if ^hx(i)>0 do .. set k=k+1 .. write $Justify(i,5),$j(^hxx(i),7)," " .. for m=1:1:k write "*" .. write ! quit