perl 删除循环中的多维哈希

z0qdvdin  于 2022-11-15  发布在  Perl
关注(0)|答案(2)|浏览(185)

我正在做一个层次聚类,在进入下一步之前,我需要删除聚类的元素。我做了一维哈希的代码,运行得很好。现在我有了二维哈希,我无法删除元素。

use strict;
use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my @array = ('A1', 'A2', 'A3');

my %distances;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors += ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            $distances{$key_1}{$key_2} = $deviation_vectors ** 0.5 unless $distances{$key_2}{$key_1};
        };
    };
};
my @values;
while (my ($key, $element) = each %distances) {
    while (my ($element, $value) = each %{$element}) {
        push @values, $value;
    };
};
my $min = (sort {$a <=> $b} @values)[0];
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors += ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_});
                };
# Problem here
# Delete doesn't completely remove the element, it returns a hash with an empty key element
                delete $hash{$key_1};
                delete $hash{$key_2};
            };
        };
    };
};
print Dumper \%hash;

sub mean {
    my @data = @_;
    my $sum;
    foreach (@data) {
        $sum += $_;
    };
    return ($sum / @data)
};

这是我得到的结果......

$VAR1 = {
          'S4' => {},
          'S2' => {},
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S2,S4' => {
                       'A2' => 7,
                       'A1' => '6.5',
                       'A3' => '3.5'
                     },
          'S1' => {
                    'A3' => 5,
                    'A1' => 10,
                    'A2' => 11
                  },
          'S5' => {
                    'A3' => 10,
                    'A1' => 3,
                    'A2' => 2
                  }
        };

需要从哈希中完全删除“S2”和“S4”。

nkkqxpd9

nkkqxpd91#

请检查以下代码,该代码基于所提供的代码,并进行了一些 * 修改 *,以通过引入两个索引来删除过多的循环。
也许这个算法中的哈希值%distances太多了,它只是为了演示的目的而保留的,因为它可能对运算有用。
注意:提供的代码仅用于演示目的,旨在提高代码可读性
信息:$distance ** 0.5最好写成sqrt($distance),文档sqrt

use strict;
use warnings;
use feature 'say';

use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my(%distances, $deviation, @array, @keys);

@array = qw(A1 A2 A3);
@keys  = sort keys %hash;

for my $index_1 (0..$#keys) {
    for my $index_2 (1+$index_1..$#keys) {
        my($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
        
        $distance += ( $hash{$key_1}{$_} - $hash{$key_2}{$_} ) ** 2 for @array;
        $distance = $distance ** 0.5;

        $distances{$key_1}{$key_2} = $distance;

        $deviation->{min} = $distance unless $deviation->{min};
         
        if( $deviation->{min} > $distance ) {
            $deviation->{min} = $distance;
            $deviation->{keys} = [$key_1, $key_2];
        }
    }
}

my($key_1, $key_2) = $deviation->{keys}->@*;

$hash{"$key_1,$key_2"}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) for @array;

delete @hash{($key_1, $key_2)};

say Dumper(\%hash);

exit 0;

sub mean {
    my @data = @_;
    my $sum;
    
    $sum += $_ for @data;
    
    return $sum / @data;
}

输出样本

$VAR1 = {
          'S2,S4' => {
                       'A1' => '6.5',
                       'A2' => '7',
                       'A3' => '3.5'
                     },
          'S5' => {
                    'A1' => 3,
                    'A3' => 10,
                    'A2' => 2
                  },
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S1' => {
                    'A2' => 11,
                    'A3' => 5,
                    'A1' => 10
                  }
        };
nzk0hqpo

nzk0hqpo2#

这是我从@北极熊解决方案中创建的子程序,它有三个参数,第一个是输入数据,第二个是子元素数组,最后一个是我们想要停止子程序的阈值。

...
sub agglomerative_clustering {
    my %data = %{$_[0]};
    my @array = @{$_[1]};
    my $threshold = $_[2];
    my $size = keys %data;
    my %clusters;
    for (my $i = 1; $i < $size; $i++) {
        my (%distances, $find, @keys);
        @keys = sort keys %data;
        for my $index_1 (0 .. $#keys) {
            for my $index_2 (1 + $index_1 .. $#keys) {
                my ($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
                $distance += ($data{$key_1}{$_} - $data{$key_2}{$_}) ** 2 foreach @array;
                $distance = sqrt($distance);
                $distances{$key_1}{$key_2} = $distance;
                $find->{min} = $distance unless $find->{min};
                $find->{key} = [$key_1, $key_2] unless $find->{key};
                if ($find->{min} > $distance) {
                    $find->{min} = $distance;
                    $find->{key} = [$key_1, $key_2];
                };
            };
        };
        my ($key_1, $key_2) = $find->{key}->@*;
        $data{"$key_1,$key_2"}{$_} = mean($data{$key_1}{$_}, $data{$key_2}{$_}) foreach @array;
        delete @data{($key_1, $key_2)};
        last if $find->{min} >= $threshold;
        %clusters = %data;
    };
    return %clusters;
};

sub mean {
    my @data = @_;
    my $sum;
    $sum += $_ for @data;    
    return $sum / @data;
};
...

相关问题