Question : Python script to use criteria to select from data frame and output to file

Hello,

I have a VB script that examines my data frame using two criteria and outputs selections based on them. My data frame is a distance matrix. I want to add randomness to the script to solve some problems with location exclusion and to create an output that is satisfactory for future statistical analyses. I was wondering if Python might be the way instead to complete this task.

The attached script uses these first two criteria to make a selection from the data frame:

1.) To be selected the distance between any pair of locations must be, at least, a specified distance.
2.) Preference is given in the selection of locations based upon a priority ranking. (i.e. in my data frame the priority ranking is expressed by the digit after the decimal for a location (e.g. X.1>X.2 >X.3))

I would like the above to happen as well as the insertion of a random component to a script. To preserve the priority preference I thought the random component could be 'constrained' to only selecting from the most preferred priority until no more locations can be selected based on the distance criterion. Then the same procedure would move on to the subsequent priority rankings to finish create the output list. The script could then produce several different outputs.

Thank you kindly,
I appreciate annotated code if possible,
JE

---This is yet another incarnation of this question. I hope its clearer. I have attached some visuals (pdf) to try to help explain whats going on and what I want. I hope the experts out there are not fatigued by this question. I really am trying to make it clear. Thanks.----

Refernences:
Question where script came from:
http://www.experts-exchange.com/Programming/Languages/Scripting/Python/Q_24048759.htm

Answer : Python script to use criteria to select from data frame and output to file

The islands8.py seems to work -- see below and have questions/objections.

                       
         
           
             1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
201:
202:
203:
204:
205:
206:
207:
208:
209:
210:
211:
212:
213:
214:
215:
216:
217:
218:
219:
220:
221:
222:
223:
224:
225:
226:
227:
228:
229:
230:
231:
232:
233:
234:
235:
236:
237:
238:
239:
240:
241:
242:
243:
244:
245:
246:
247:
248:
249:
250:
251:
252:
253:
254:
255:
256:
257:
258:
259:
260:
261:
262:
263:
264:
265:
266:
267:
268:
269:
270:
271:
272:
273:
274:
275:
276:
277:
278:
279:
280:
281:
282:
283:
284:
285:
286:
287:
288:
289:
290:
291:
292:
293:
294:
295:
296:
297:
298:

           
           
             import csv
 
inputFname = 'N-x-N-2.csv'
 
 
class Point:
    def __init__(self, compoundId, index):
        lst = compoundId.split('.')
        assert len(lst) == 2
        self.id = int(lst[0])
        self.pref = int(lst[1])
        self.index = index
    
    def __str__(self):
        '''Human readable representation of the point.'''
        return str(self.id) + '.' + str(self.pref)
 
 
def cmpPoints(a, b):
    if a.pref < b.pref:
        return -1
    elif a.pref > b.pref:
        return +1
 
    assert a.pref == b.pref
    if a.id < b.id:
        return -1
    elif a.id > b.id:
        return +1
    else: 
        return 0
 
 
def buildMatrices(fname, distance):
    fin = open(fname, 'rb')
    reader = csv.reader(fin)
 
    # Get the line with point identifications, build the point-object list.
    idlst = reader.next()
    idlst = idlst[1:]
    points = [ Point(e, n) for n, e in enumerate(idlst) ]
 
    mN = []  # the numeric matrix as the list of lists
    mB = []  # the boolean matrix as the list of lists
    for row in reader:
        lstN = []     # next row of the numeric matrix
        lstB = []     # next row of the boolean matrix
 
        for e in row[1:]:
            if e == '':
                n = 0.0
            else:
                n = float(e)
 
            lstN.append(n)
            lstB.append(n <= distance) # also the reflexivity
 
        mN.append(lstN)
        mB.append(lstB)
    fin.close() 
 
    return mN, mB, points
 
 
def subMatrix(mSrc, points):
    # The mSrc is full matrix (can be both boolean or numeric).
    # Consider only the rows and columns indexed by the points.
    # The list of points is subset of the full list of points.
    idxLst = [ p.index for p in points ]
 
    m = []  # the matrix as the list of lists
    for rowIdx in idxLst:
        lst = []             # row reduced to points columns
        row = mSrc[rowIdx]
        for colIdx in idxLst:
            lst.append(mSrc[rowIdx][colIdx])
        m.append(lst)
    return m
 
 
def copyMatrix(src):
    dst = []  # the matrix as the list of lists
    for row in src:
        dst.append(row[:])   # append copy of the list
    return dst
 
 
def transitiveClosure(m):
    '''Transitive closure of the boolean matrix (in situ).'''
    n = len(m)
    rlst = range(n)
    changed = True
    while changed:
        changed = False
        for k in rlst:
            for i in rlst:
                for j in rlst:
                    value = m[i][j] or (m[i][k] and m[k][j])
                    if value != m[i][j]:
                        m[i][j] = value
                        changed = True
    return m
 
 
def boolMatrixToFile(m, f):
    # f ... text file open for writing
    for row in m:
        lst = []
        for e in row:
            if e: 
                lst.append('@') 
            else:
                lst.append('-') 
 
        f.write(''.join(lst) + '\n')
 
 
def separateSinglesFromIslands(m, points):
    # The points have to be related to bool matrix m. 
    # The m is expected to be the full matrix.
 
    singles = []     # list of single point islands
    others = []      # list of points that are parts of islands (indices)
 
    for idx, row in enumerate(m):
        singleDetected = True  # optimistic initialization
 
        for i, v in enumerate(row):
            if v and idx != i:          # if related to another point
                singleDetected = False  # cannot be a single
                break                   # shortcut evaluation
 
        if singleDetected:
            singles.append(points[idx])
        else:
            others.append(points[idx])
 
    return singles, others
 
 
def getIslands(m, points):
    # The points list must be related to the boolean matrix m.
    # The m is expected to contain the transitive closure.
 
    processed = set()          # set of processed point indices
    islands = []               # list of lists (one list, one island of more points)
 
    for idx, row in enumerate(m):
        if idx in processed:
            continue           # already processed point (part of some island)
 
        lst = []               # init -- the island with no points
        for i, v in enumerate(row):
            if v:
                lst.append(points[i])  # point of the island
                processed.add(i)       # the point processed
 
        if len(lst) > 1: 
            islands.append(lst)        # another island
 
    return islands
 
 
def singlesToFile(points, fname):
    fout = open(fname, 'w')
    lst = [ str(p) for p in points ]
    fout.write('Singles: %s\n\n' % ', '.join(lst))
    fout.close()
 
 
def islandInfoStr(n, isle): 
    lst = [ str(p) for p in isle ]
    return 'Island %s: %s' % (n, ', '.join(lst))
 
 
def islandsToFile(islands, fname):
    fout = open(fname, 'w')
    for n, isle in enumerate(islands):
        fout.write(islandInfoStr(n, isle))
        fout.write('\n')     
    fout.close()
 
 
 
def subsolutionGenerator(points, m):
    # This is the generator that recursively searches
    # for the subsolutions related to the starting points.
    # and returns (yields) them.
    # The m must be full boolean matrix that can be indexed
    # by pt.index.
    
    # Quit the recursion if there is nothing to process.
    # Return the empty list of subsolutions in such case.
    if len(points) == 0:
        yield []
        return
    
    # Get the list of startpoints. They have all the same
    # preference as the first point (the list was sorted earlier).
    pt = points[0]
    pref = pt.pref
    startLst = [ p for p in points if p.pref == pref ]
 
    # If the startpoint "consumes" another startpoint in its
    # subsolutions, then the consumed points cannot be used as
    # starting points as they would produce the non-unique subsolutions.
    # The detection will be done through the set of consumed
    # starting points. This holds only on the same level of 
    # preference, i.e. for this recursive level.
    consumed = set()
   
    # Now start the recursive solution from each of the non-consumed
    # starting points. 
    for startPt in startLst:
        if startPt.id in consumed:
            continue              # skip the consumed starting point
 
        # The starting point was not consumed but it is going to be
        # consumed in this loop. It must not be a candidate for the
        # recursive call.
        consumed.add(startPt.id)
        
        # Get the next candidates and recursively compute 
        # subsolutions of the candidates. Then yield the concatenation
        # of the startpoint with the subsolutions of the candidates. 
        # The non-candidates are simply thrown away, because they cannot
        # be part of the subsolution with the given starting point.
        # The order of candicates must be preserved.
        candidates = []
        for pt in points:
            if pt.id in consumed:
                continue          # the consumed point cannot be the candidate
 
            if not m[startPt.index][pt.index]:   # if the points do not touch...
                candidates.append(pt)
  
        # Get the subsolutions of the candidates recursively 
        # and concatenate it with the starting point. If the subsolution
        # contains any of the starting points, they must be added to the
        # consumed set. The detection is done via the same preference
        # as the stating point has.
        for subLst in subsolutionGenerator(candidates, m):
            for p in subLst:
                if p.pref == startPt.pref:
                    consumed.add(p.id)   # starting point consumed by the subsolution
                else:
                    break                # the other points cannot have the same pref
            
            # Yield the next subsolution.
            yield [ startPt ] + subLst
 
 
def islandSubsolutions(n, isle, mBFull):
    # n is just a number to identify the island in the output.
    # isle is list of points.
    # mBFull is the full boolean matrix.
    
    fname = 'isle%02d.txt' % n
    fout = open(fname, 'w')
 
    # Display the non-reduced island.
    fout.write(islandInfoStr(n, isle))
    fout.write('\n')
 
    # Display the boolean matrix for the island.
    m = subMatrix(mBFull, isle)
    boolMatrixToFile(m, fout)
    fout.write('\n')
 
    # Now, get the subsolutions for the island. The points
    # must be sorted to process the points with higher preference
    # (i.e. smaller .n) earlier.
    isle.sort(cmpPoints)
    for x, subLst in enumerate(subsolutionGenerator(isle, mBFull)):
        fout.write('Subsolution %d: ' % x)
        idLst = [ str(pt) for pt in subLst ]
        fout.write(', '.join(idLst))
        fout.write('\n')
 
    fout.close()
        
    
if __name__ == '__main__':
 
    distance = 16100
    mN, mB, points = buildMatrices(inputFname, distance)
    singles, others = separateSinglesFromIslands(mB, points)
    
    singlesToFile(singles, 'singles.txt')
 
    m1 = subMatrix(mB, others)
    m2 = transitiveClosure(copyMatrix(m1))
 
    islands = getIslands(m2, others)
    islandsToFile(islands, 'islands.txt')
 
    for n, isle in enumerate(islands):
        islandSubsolutions(n, isle, mB)
           
         

Open in New Window Select All

info8.zip (11 KB) (File Type Details)

single.txt, islands.txt, isleXX.txt with subsolutions