Background... exploring the ArcGIS module and looking at how it worked which lead me to SpatialDataFrames, the cloaked version of a Pandas df with geometry stuff, which lead me to some of its inner workings etc etc.
I use Numpy and lot and the FeatureClassToNumPy array with some fancy array splitting generally satisfies all my needs to get the geometry yanked out of a featureclass... do the magic... then send it back to a featureclass.
I tend not to use searchcursors, but the ArcGIS code uses it in several places, so, I thought... give it a shot.
So... helper function ...
import sys
from textwrap import dedent
import numpy as np
import arcpy
from arcgis.geometry import _types
from arcgis.features._data.geodataset import SpatialDataFrame as SDF
import pandas as pd
def fc_info(in_fc):
"""Return basic featureclass information, including...
: SR - spatial reference object (use SR.name to get the name)
: shp_fld - field name which contains the geometry object
: oid_fld - the object index/id field name
: - others: 'areaFieldName', 'baseName', 'catalogPath','featureType',
: 'fields', 'hasOID', 'hasM', 'hasZ', 'path'
: - all_flds =[i.name for i in desc['fields']]
"""
desc = arcpy.da.Describe(in_fc)
args = ['shapeFieldName', 'OIDFieldName', 'spatialReference', 'shapeType']
shp_fld, oid_fld, SR, shp_type = [desc[i] for i in args]
return shp_fld, oid_fld, SR, shp_type
@time_deco
def to_arr(in_fc, use_geo=False):
"""Convert a featureclass to a structured or recarray using a searchcursor.
:
:Requires: import arcpy, numpy as np
:--------
: in_fc - featureclass
: use_geo - True .__geo_interface__
: - list comprehension
:get the row information
: cycle through all geometries and get xy pairs
:
:References:
:----------
: - see the polygon, polyline etc classes in
: C:\ArcPro\Resources\ArcPy\arcpy\arcobjects\geometry.py
"""
shp_fld, oid_fld, SR, shp_type = fc_info(in_fc)
flds = arcpy.ListFields(in_fc)
fnames = [f.name for f in flds if f.type not in ['OID', 'Geometry']]
geom_flds = ['SHAPE@', oid_fld] + fnames
flds = [shp_fld, oid_fld] + fnames
vals = []
geoms = []
coords = []
idx = flds.index(shp_fld)
with arcpy.da.SearchCursor(in_fc,
field_names=geom_flds, where_clause=None,
spatial_reference=SR, explode_to_points=False,
sql_clause=(None, None)) as rows:
for row in rows:
row = list(row)
geom = row.pop(idx)
vals.append(row)
geoms.append(geom)
if use_geo:
xy = geom.__geo_interface__['coordinates']
else:
xy = [np.array([(pt.X, pt.Y) for pt in arr if pt])
for arr in geom]
coords.append(np.asarray(xy))
del row, geom, xy
del rows
return vals, coords, geoms
Now on to the next '2'
def two_arrays(in_fc, keep_geom=True, dtype0=True, as_recarray=False):
"""Send to a numpy structured/array and split it into a geometry array
: and an attribute array.
:
:Requires:
:--------
: fc_info(in_fc) - function needed to return fc properties
: keep_geom
: - True, split the points into their geometry groups as an object array
: - False, a sequential array of shape = (N,)
: dtype0
: - True dt = [('Idx', '<i4'), ('Xs', '<f8'), ('Ys', '<f8')]
: - False dt = [('Idx', '<i4'), ('XY', '<f8', (2,))]
"""
shp_fld, oid_fld, SR, shp_type = fc_info(in_fc)
all_flds = arcpy.ListFields(in_fc)
b_flds = [i.name for i in all_flds]
a = arcpy.da.FeatureClassToNumPyArray(in_fc,
field_names=[oid_fld, shp_fld],
explode_to_points=True,
spatial_reference=SR)
if dtype0:
dt = [('Idx', '<i4'), ('Xs', '<f8'), ('Ys', '<f8')]
dtb = [('Idx', '<i4'), ('Xc', '<f8'), ('Yc', '<f8')]
else:
dt = [('Idx', '<i4'), ('XY', '<f8', (2,))]
dtb = [('Idx', '<i4'), ('XYc', '<f8', (2,))]
a.dtype = dt
oid_fld = 'Idx'
if keep_geom:
a = np.split(a, np.where(np.diff(a[oid_fld]))[0] + 1)
a = np.r_[a]
if as_recarray:
a = a.view(object, np.recarray)
b = arcpy.da.FeatureClassToNumPyArray(in_fc, field_names=b_flds,
explode_to_points=False,
spatial_reference=SR)
dtb.extend(b.dtype.descr[2:])
b.dtype = dtb
return a, b
Now testing on trivial samples is trivial. Before I moved on to big samples (geometries with millions of points), I tried it on a suitable one that was readily available...
a.size
a.shape
sum([i.size for i in a])
array([ array([(1, -65.55527496337874, 49.25722122192411),
(1, -65.55305480957014, 49.25694274902389), ...,
(1, -55.902732849121094, 51.63003158569353),
(1, -55.90134429931635, 51.63085556030313)],
dtype=[('Idx', '<i4'), ('Xs', '<f8'), ('Ys', '<f8')]),
array([(2, -81.8187484741211, 68.89791870117205),
(2, -81.81718444824207, 68.89531707763712), ...,
(2, -99.49025726318348, 51.53623199462936),
(2, -99.49021148681629, 51.536178588867415)],
dtype=[('Idx', '<i4'), ('Xs', '<f8'), ('Ys', '<f8')]),
array([(3, -114.51197814941384, 73.37343597412138),
(3, -114.5083312988279, 73.37291717529314), ...,
(3, -69.89323425292969, 83.10884857177757),
(3, -69.89251708984364, 83.10861206054705)],
dtype=[('Idx', '<i4'), ('Xs', '<f8'), ('Ys', '<f8')])], dtype=object)
Timing results
Timing function for... to_arr
Time: 4.63e+01s for 3 objects # 46 seconds
_ = two_arrays(in_fc)
Timing function for... two_arrays
Time: 4.45e+00s for 2 objects # 4.5 seconds
I kept the scripts intact for people to try out on their data. I even went as far as to strip out anything unnecessary in other versions but nothing reduced the time of the searchcursor approach. I am trying to get a handle on 'why'. I would have guessed that a similar base code would have been recycled in the searchcursor and the FeatureClassToNumPyArray functionality. I guess I am wrong OR there is some fundamental flaw in my implementation of 'to_arr' vs 'two_arrays'... so if you have a moment or two... have a look-see
Dan