Commit a6a2320a authored by Alexander Butyaev

added subspace clustering with models.py as a reference

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# ColonyB Data Analysis Code
This repository contains scripts and files for ColonyB data analysis.
## Setup
Create a Python 3 environment:
```bash
python -m venv venv
source venv/bin/activate
```
and then install dependencies:
```bash
pip install -r requirements.txt
```
The code references the database models, e.g.
```python
from db_app.models import *
```
The file `src/models.py` is included as a reference and can be used to trace the complete database schema.
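If the analysis scripts are run outside the original Django project, one possible way to make the import resolve (a hypothetical, untested sketch, not part of this repository) is to expose the reference file under the expected package name:
```python
# db_app/models.py -- hypothetical shim: re-export the reference models so that
# `from db_app.models import *` resolves. Assumes `src` is an importable package
# and that Django settings are configured (the reference models likely require Django).
from src.models import *  # noqa: F401,F403
```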
asgiref==3.4.1
autopep8==1.5.7
decorator==4.4.2
Django==3.2.5
djangorestframework==3.12.4
networkx==2.5.1
numpy==1.19.5
pycodestyle==2.7.0
pytz==2021.1
scipy==1.5.4
sqlparse==0.4.1
toml==0.10.2
typing-extensions==3.10.0.0
import numpy as np
class Dense_Unit:
    # data structure for a dense unit: a set of primitive units plus the points they share
    def __init__(self, units, points=None):
# self.axes = axes # TBD - to delete???
self.units = units # units - list of Primitive_Units
# find points in the dense unit (for simplification uses just custom points)
self.points = points if points else set.intersection(
*(u.points for u in units))
def merge(self, dense_unit2, density_th, point_weights=[], simplified=False):
        '''
        Merge two dense units. Returns None if the unit prefixes differ;
        otherwise returns a (dense_unit_or_None, density) tuple, where the
        dense unit is None when the merged density is below density_th.
        '''
if not simplified and set(self.units[:-1]) != set(dense_unit2.units[:-1]):
return None
prob_points = self.points & dense_unit2.points
# define density
density = sum([point_weights[p] for p in prob_points]
) if point_weights else len(prob_points)
if density >= density_th:
units = sorted(set(self.units) | set(
dense_unit2.units), key=lambda x: x.axis)
return Dense_Unit(units=units, points=prob_points), density
return None, density
def __str__(self):
return "%s , %s" % (self.units, len(self.points))
class Primitive_Unit:
    # data structure for a primitive unit (a single partition cell on one axis)
    def __init__(self, iid, begin, end, does_include, axis, points=None):
self.iid = iid
self.axis = axis
self.begin = begin
self.end = end
self.does_include = does_include
        self.points = points if points is not None else set()
def is_dense(self, density_th, point_weights=[]):
if point_weights:
# unit_density = sum([point_weights.get(point, 1) for point in self.points])
unit_density = sum([point_weights[point] for point in self.points])
else:
unit_density = len(self.points)
return unit_density >= density_th
def __str__(self):
close_bracket = "]" if self.does_include else ")"
return "axis %s [%s,%s%s; %s points" % (self.axis, self.begin, self.end, close_bracket, len(self.points))
class Subspace:
def __init__(self, subspace1, subspace2, density_th, point_weights=[], simplified=False, verbal=False):
self.verbal = verbal
self.axes = sorted(set(subspace1.axes) | set(subspace2.axes))
self.missed_densities = {}
self.dense_areas = self.set_dense_areas(subspace1.get_dense_areas(
), subspace2.get_dense_areas(), density_th, point_weights, simplified=simplified)
self.set_coverage()
def set_dense_areas(self, dense_areas1, dense_areas2, density_th, point_weights=[], simplified=False):
        # try all pairwise combinations of dense areas and keep the merges that stay dense
dense_areas = []
composition_densities = []
missed_densities = []
for da1 in dense_areas1:
for da2 in dense_areas2:
dd = da1.merge(da2, density_th, point_weights,
simplified=simplified)
if dd:
dense_unit, density = dd
if dense_unit:
composition_densities.append(density)
dense_areas.append(dense_unit)
else:
missed_densities.append(density)
if missed_densities:
self.missed_densities = {
"min": min(missed_densities),
"max": max(missed_densities),
"mean": np.mean(missed_densities),
"median": np.median(missed_densities),
"std": np.std(missed_densities),
"count": len(missed_densities),
"total": len(dense_areas1)*len(dense_areas2),
"axes": self.axes
}
if self.verbal and composition_densities:
print("Min %s; Max %s; mean %s; std %s; median %s"
% (np.min(composition_densities), np.max(composition_densities),
np.mean(composition_densities), np.std(
composition_densities),
np.median(composition_densities)))
return dense_areas
def get_dense_areas(self):
return self.dense_areas
def is_mergeable(self, subspace2, simplified=False):
if simplified:
return not set(self.axes) & set(subspace2.axes)
return set(self.axes[:-1]) == set(subspace2.axes[:-1])
def merge(self, subspace2, density_th, point_weights=[], simplified=False):
if not self.is_mergeable(subspace2, simplified=simplified):
print("NOT MERGEBLE ", str(self.axes), str(subspace2.axes))
return None
# be sure about the order
if simplified or self.axes[-1] < subspace2.axes[-1]:
return Subspace(self, subspace2, density_th, point_weights, simplified=simplified)
        return Subspace(subspace2, self, density_th, point_weights, simplified=simplified)
def is_valid(self):
'''
contains at least one dense area
'''
return len(self.dense_areas) > 0
def set_coverage(self):
self.coverage = sum([len(da.points) for da in self.dense_areas])
def get_coverage(self):
return self.coverage
def __str__(self):
return "axes %s dense_areas : %s" % (self.axes, ','.join([str(d) for d in self.dense_areas]))
class Primitive_Subspace(Subspace):
def __init__(self, id, coordinates, partitions_number, density_th, point_weights=[], points=None, custom_min=None, custom_max=None, verbal=False):
        '''
        Initiates a primitive (1-dimensional) subspace: partitions the axis and keeps the dense cells.
        '''
self.iid = id
self.axes = [id, ]
self.verbal = verbal
self.primitive_units = self.set_primitive_units(
id, coordinates, partitions_number, density_th, point_weights=point_weights, points=points, custom_min=custom_min, custom_max=custom_max)
self.dense_areas = self.set_dense_areas()
self.set_coverage()
self.is_dummy = False
self.missed_densities = {}
def set_primitive_units(self, subspace_id, coordinates, partitions_number, density_th, point_weights=[], points=None, custom_min=None, custom_max=None):
mmax = custom_max if custom_max is not None else max(coordinates)
mmin = custom_min if custom_min is not None else min(coordinates)
# include the last point - @TBD
# mmax += ( mmax - mmin ) / partitions_number / 100 # just to be sure
partition_size = (mmax - mmin) / partitions_number
partitions = self.get_partitions(
subspace_id, mmin, mmax, partitions_number)
        # map points to coordinates:
        # abstract enumerate by default, or the specific points if provided
        point_coord_list = zip(
            points, coordinates) if points else enumerate(coordinates)
        # bucket points per partition to avoid excessive set intersections
        temp_dist_partition = [[] for _ in range(partitions_number)]
        for point, coord in point_coord_list:
            partition_id = min(
                int((coord - mmin) / partition_size), partitions_number - 1)
            temp_dist_partition[partition_id].append(point)
        for partition_id in range(partitions_number):
            partitions[partition_id].points = set(
                temp_dist_partition[partition_id])
        # keep only the partitions that are dense
return [_part for _part in partitions if _part.is_dense(density_th, point_weights)]
def get_partitions(self, subspace_id, mmin, mmax, partitions_number):
partition_size = (mmax - mmin) / partitions_number
return [
Primitive_Unit(
iid=i,
begin=mmin + i * partition_size,
end=mmin + (i + 1) * partition_size,
does_include=i == partitions_number - 1,
axis=subspace_id
)
for i in range(partitions_number)
]
def get_primitive_units(self):
return self.primitive_units
def get_primitive_units_dict(self):
return {prim.iid: prim for prim in self.primitive_units}
    def set_dense_areas(self):
        # each dense 1-D primitive unit becomes a 1-D dense unit
        return [
            Dense_Unit(units=[prim, ], points=prim.points)
            for prim in self.primitive_units
        ]
class Dummy_Primitive_Subspace(Primitive_Subspace):
def __init__(self, id, coordinates, partitions_number, density_th, point_weights=[], points=None, custom_min=None, custom_max=None, verbal=False):
        '''
        Same as Primitive_Subspace, but keeps every 1-D cell (the density threshold is forced to 1).
        '''
self.iid = id
self.axes = [id, ]
self.verbal = verbal
        # density_th is forced to 1 to keep every cell of the 1-D space (@TBD: might be implemented more efficiently)
        # point_weights is forced to [] to exclude any bias;
        # both parameters remain in the signature because they are inherited from the parent class
self.primitive_units = self.set_primitive_units(id, coordinates, partitions_number, 1, point_weights=[
], points=points, custom_min=custom_min, custom_max=custom_max)
self.dense_areas = self.set_dense_areas()
self.set_coverage()
self.is_dummy = True
self.missed_densities = {}
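# --- Minimal usage sketch (illustrative addition, not part of the original module).
# Uses small synthetic 2-D data to show how two 1-D Primitive_Subspaces merge
# into a 2-D Subspace whose dense areas are the CLIQUE-style dense cells.
if __name__ == "__main__":
    xs = [0.10, 0.15, 0.20, 0.12, 0.80, 0.85, 0.90, 0.88, 0.50, 0.55]
    ys = [0.10, 0.12, 0.18, 0.15, 0.82, 0.88, 0.90, 0.85, 0.50, 0.52]
    density_th = 3
    s0 = Primitive_Subspace(0, xs, partitions_number=5, density_th=density_th)
    s1 = Primitive_Subspace(1, ys, partitions_number=5, density_th=density_th)
    merged = s0.merge(s1, density_th, simplified=True)
    if merged and merged.is_valid():
        # two dense 2-D cells expected: around (0.1, 0.1) and (0.85, 0.85)
        print("axes:", merged.axes, "dense areas:", len(merged.dense_areas),
              "coverage:", merged.get_coverage())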
from numpy import transpose
from db_app.models import *
from .clique_mod import Clique_Hardware, Clique_Weight_Class
from .clique_engine import Clique
def get_useless_non_labeled_data_indices(clique_results):
    '''
    Return the indices of points that stay unlabeled (-1) in every result.
    The mean label across results equals -1 exactly when every run assigned -1
    (labels are assumed to be >= -1).
    '''
    if not clique_results:
        return None
    result_number = len(clique_results)
    point_number = len(clique_results[0])
    cut_indices = []
    for p in range(point_number):
        if sum([d[p] for d in clique_results]) / result_number == -1:
            cut_indices.append(p)
    return cut_indices
problem_id = 7
# extract all necessary data from database related to problem = problem_id
clique_hw = Clique_Hardware(problem_id=problem_id)
# get data itself
data = clique_hw.get_multidimensional_data()
# get data dimensionality
dimensionality = clique_hw.get_dimensionality()
unseenF = lambda x: 0  # alternative: lambda x: sum(x) / len(x)
# parameters to iterate for creation of configuration
params = [
# {"clique_version" : 0, "th": list(range(3,33,2)) },
# {"clique_version" : 1, "th": list(range(3,33,2)) },
# {"clique_version" : 2, "user_score_threshold" : 90, "th": list(range(3,33,2))},
# {"clique_version" : 3, "th": list(range(3,33,2))},
# {"clique_version" : 4, "avg_point_amount_rate_range" : (0.5, 0.7), "th": list(range(3,33,2))},
# {"clique_version" : 5, "avg_point_amount_rate_range" : (0.5, 0.7), "th": list(range(3,33,2))},
{"clique_version" : 6, "th": list(range(3,33,2)), "weightUnseenSP":unseenF},
]
# create weights for every point based on given configurations (params)
# takes some time...
# should be done only once to iterate after this over the results for different Clique modifications
clique_weights = Clique_Weight_Class(problem_id, dimensionality, params=params)
partitions_number = 50
clique_results = []
for param in params:
clique_version = param["clique_version"]
    user_score_threshold = param.get("user_score_threshold", 90)
    avg_point_amount_rate_range = param.get("avg_point_amount_rate_range", (0.5, 0.7))
    for density_th in param["th"]:
        point_weights = clique_weights.get_point_weight(
            clique_version, problem_id, dimensionality,
            user_score_threshold=user_score_threshold,
            avg_point_amount_rate_range=avg_point_amount_rate_range
        )
        # Run clique itself
        clique = Clique(data,
                        partitions_number,
                        density_th,
                        simplified=True,
                        point_weights=point_weights[0] if clique_version == 6 else point_weights,
                        kvargs={})  # kvargs can shift the begin/end of the 1-D interval (by default computed from the distribution)
        clusters_vector = clique.get_clusters_vector()
        clique_results.append(clusters_vector)
        counter = set(clusters_vector)
        print("Clique #%s with density_threshold = %s contains %s clusters." % (clique_version, density_th, len(counter)))
cut_indices = get_useless_non_labeled_data_indices(clique_results)
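# --- Illustrative follow-up (a sketch, not part of the original script):
# points in cut_indices were labeled -1 by every configuration, so they can be
# dropped from each result vector before any downstream comparison.
if cut_indices:
    cut = set(cut_indices)
    keep = [p for p in range(len(clique_results[0])) if p not in cut]
    filtered_results = [[vec[p] for p in keep] for vec in clique_results]
    print("Dropped %s uniformly unclustered points; %s remain." % (len(cut), len(keep)))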