#include <assert.h>
#include <float.h>
#include "mbed.h"
#include "muscle_tree.h"
#include "seq.h"
#include "symmatrix.h"
#include "ktuple_pair.h"
#include "tree.h"
#include "util.h"
#include "progress.h"
#include "list.h"
#include "log.h"
#include "kmpp/KMeans.h"
#include "mbed.h"
Data Structures | |
struct | bisecting_kmeans_result_t |
Defines | |
#define | TIMING 0 |
#define | FULL_WITHIN_CLUSTER_DISTANCES 1 |
#define | COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 |
#define | USE_KMEANS_LLOYDS 0 |
#define | log2(x) (log(x) / 0.69314718055994530942) |
#define | NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) |
#define | SEED_SELECTION SELECT_SEEDS_BY_LENGTH |
#define | USE_EUCLIDEAN_DISTANCE 1 |
#define | PRINT_CLUSTER_DISTRIBUTION 0 |
#define | TRACE 0 |
Enumerations | |
enum | SEED_SELECTION_TYPE { SELECT_SEEDS_RANDOMLY, SELECT_SEEDS_BY_LENGTH } |
Functions | |
void | FreeKMeansResult (bisecting_kmeans_result_t **prResult_p) |
Free KMeans result structure. | |
void | NewKMeansResult (bisecting_kmeans_result_t **prKMeansResult_p) |
Allocate new KMeans result. | |
double | EuclDist (const double *v1, const double *v2, const int dim) |
Calculate the euclidean distance between two vectors. | |
double | CosDist (const double *v1, const double *v2, const int dim) |
Calculate the cosine distance between two vectors. | |
int | SeqToVec (double **ppdSeqVec, mseq_t *prMSeq, int *piSeeds, const int iNumSeeds, const int iPairDistType) |
Convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked beforehand. | |
int | SeedSelection (int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq) |
Select seeds to be used from a prMSeq. | |
void | BisectingKmeans (bisecting_kmeans_result_t **prKMeansResult_p, const int iNObjs, const int iDim, double **ppdVectors, const int iMinRequiredObjsPerCluster, const int iMaxAllowedObjsPerCluster) |
Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. | |
int | Mbed (tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, const char *pcGuidetreeOut) |
From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396. |
#define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 |
#define FULL_WITHIN_CLUSTER_DISTANCES 1 |
#define log2 | ( | x | ) | (log(x) / 0.69314718055994530942) |
#define NUMBER_OF_SEEDS | ( | n | ) | pow(log2(((double)n)), 2) |
#define PRINT_CLUSTER_DISTRIBUTION 0 |
#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH |
#define TIMING 0 |
#define TRACE 0 |
#define USE_EUCLIDEAN_DISTANCE 1 |
#define USE_KMEANS_LLOYDS 0 |
enum SEED_SELECTION_TYPE |
void BisectingKmeans | ( | bisecting_kmeans_result_t ** | prKMeansResult_p, | |
const int | iNObjs, | |||
const int | iDim, | |||
double ** | ppdVectors, | |||
const int | iMinRequiredObjsPerCluster, | |||
const int | iMaxAllowedObjsPerCluster | |||
) |
Bisecting K-Means clustering. Repeatedly calls K-Means with a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
[out] | prKMeansResult_p | Result of Bisecting KMeans. Will be allocated here. Caller has to free. See FreeKMeansResult(). |
[in] | iNObjs | Number of objects/sequences to cluster |
[in] | iDim | Dimensionality of input data |
[in] | ppdVectors | each row holds iDim points for this object's coordinates |
[in] | iMinRequiredObjsPerCluster | Minimum number of objects per cluster (inclusive). |
[in] | iMaxAllowedObjsPerCluster | Maximum number of objects per cluster (inclusive). Function returns once no cluster contains more than this number of objects. Soft limit! |
double CosDist | ( | const double * | v1, | |
const double * | v2, | |||
const int | dim | |||
) |
Calculate the cosine distance between two vectors.
[in] | v1 | First vector with dim dimensions |
[in] | v2 | Second vector with dim dimensions |
[in] | dim | Dimensionality of v1 and v2 |
double EuclDist | ( | const double * | v1, | |
const double * | v2, | |||
const int | dim | |||
) |
Calculate the euclidean distance between two vectors.
[in] | v1 | First vector with dim dimensions |
[in] | v2 | Second vector with dim dimensions |
[in] | dim | Dimensionality of v1 and v2 |
void FreeKMeansResult | ( | bisecting_kmeans_result_t ** | prResult_p | ) |
Free KMeans result structure.
[out] | prResult_p | K-Means result to free |
int Mbed | ( | tree_t ** | prMbedTree_p, | |
mseq_t * | prMSeq, | |||
const int | iPairDistType, | |||
const char * | pcGuidetreeOut | |||
) |
From scratch reimplementation of mBed: Blackshields et al. (2010); PMID 20470396.
The idea is as follows:
[out] | prMbedTree_p | Created UPGMA tree. Will be allocated here. Use FreeMuscleTree() to free. |
[in] | prMSeq | Multiple sequences |
[in] | iPairDistType | Distance measure for pairwise alignments |
[in] | pcGuidetreeOut | Passed down to GuideTreeUpgma() |
void NewKMeansResult | ( | bisecting_kmeans_result_t ** | prKMeansResult_p | ) |
Allocate new KMeans result.
[out] | prKMeansResult_p | Newly allocated K-Means result. Free with FreeKMeansResult(). |
int SeedSelection | ( | int * | piSeeds, | |
int | iNumSeeds, | |||
int | iSelectionMethod, | |||
mseq_t * | prMSeq | |||
) |
Select seeds to be used from a prMSeq.
[out] | piSeeds | The indices of the prMSeq sequences to be used as seeds will be stored here. Must be preallocated. |
[in] | iNumSeeds | Number of seeds to be picked |
[in] | iSelectionMethod | Seed selection method to be used |
[in] | prMSeq | The prMSeq structure to pick sequences from |
int SeqToVec | ( | double ** | ppdSeqVec, | |
mseq_t * | prMSeq, | |||
int * | piSeeds, | |||
const int | iNumSeeds, | |||
const int | iPairDistType | |||
) |
Convert sequences into mbed-like (distance) vector representation. Seeds (prMSeq sequence indices) have to be picked beforehand.
[out] | ppdSeqVec | Pointer to preallocated matrix of size nseqs x iNumSeeds |
[in] | prMSeq | Sequences which are to be converted |
[in] | piSeeds | Array of sequences in prMSeq which are to be used as seeds. |
[in] | iNumSeeds | Number of seeds/elements in piSeeds |
[in] | iPairDistType | Type of pairwise distance comparison |