私は次の解決策を採用しました。
--------------------------------
1. ホスト名ごとに、last_update の最小値と最大値、sum(interval_avg)、sum(numb_updates)、および重複行数 count(*) を選択します
//This will get the interval_avg value
//(summarize is ok, since all except min(id) will be zero),
//give a count of how many duplicates there are per hostname,
//and will also summarize numb_updates
-- One summary row per hostname that occurs more than once in hostname_table.
--   id                  lowest id within the hostname group
--   total_numb_updates  sum of numb_updates over the group
--   total_interval_avg  sum of interval_avg over the group
--   last_update_max/min newest and oldest last_update in the group
--   hostname_count      number of rows sharing the hostname
SELECT
    MIN(id)           AS id,
    hostname,
    SUM(numb_updates) AS total_numb_updates,
    SUM(interval_avg) AS total_interval_avg,
    MAX(last_update)  AS last_update_max,
    MIN(last_update)  AS last_update_min,
    COUNT(*)          AS hostname_count
FROM hostname_table
GROUP BY hostname
-- Keep only hostnames that actually have duplicates.
HAVING COUNT(*) > 1
//Get all last_update values from each duplicate hostname (including the original).
//Don't do this in a separate query; you only need first+last+rowcount to figure
//out the interval average. It took me a while to realize this, so I tried many
//variations with little success (they took too long with 600k+ rows).
//
// --- I will include the solution I didn't go for, ---
// --- so others wont do the same mistake ---
//
// START DONT USE THIS
// 2.63sec @ 10000 rows
// Slow variant kept for comparison only: returns id, hostname and last_update
// for every row whose hostname appears more than once, by joining the table
// against a grouped sub-select of the duplicated hostnames.
$sql = <<<SQL
SELECT
id,
{$db_table}.hostname,
last_update
FROM
{$db_table}
INNER JOIN (
SELECT
hostname,
COUNT(*)
FROM
{$db_table}
GROUP BY
hostname
HAVING
COUNT(*)>1
) as t2
ON
{$db_table}.hostname = t2.hostname
SQL;
// Run the query on the existing connection; $resource holds the result handle.
$resource = mysql_query($sql, $con);
// END DONT USE THIS (below is a 94% improvement)
//
// START THIS IS BETTER, BUT DONT USE THIS
// 0.16 sec @ 10000 rows
// Faster than the join variant, but still not the recommended approach.
// Strategy: fetch all ids, fetch the ids of hostnames that occur exactly once,
// subtract the two sets, then load the remaining (duplicate) rows by id.
// NOTE(review): the mysql_* functions were removed in PHP 7; this snippet only
// runs on older PHP versions.

// Collect the id of every row in the table.
$sql = "SELECT id
FROM ".$db_table;
$resource = mysql_query($sql,$con);
$array_id_all = array();
while($assoc = mysql_fetch_assoc($resource)){
    $array_id_all[] = $assoc['id'];
}
// Collect the id of every hostname that occurs exactly once (no duplicates).
$sql = "SELECT
MIN(id) as id,
hostname
FROM
".$db_table."
GROUP BY
hostname
HAVING
COUNT(*)=1";
$resource = mysql_query($sql,$con);
$array_id_unique = array();
while($assoc = mysql_fetch_assoc($resource)){
    $array_id_unique[] = $assoc['id'];
}
// Every id that does not belong to a unique hostname is part of a duplicate group.
$array_id_non_unique = array_diff($array_id_all, $array_id_unique);
$id_list_non_unique = implode(", ", $array_id_non_unique);
// Each entry: array(id, hostname, interval_avg, last_update, numb_updates).
$array_duplicates = array();
// With no duplicates the id list is empty and "WHERE id IN ()" is a MySQL
// syntax error, so only run the query when there is something to look up.
if($id_list_non_unique !== ""){
    //Select everything from the table when the IDs are IN $id_list_non_unique
    $sql = "SELECT *
FROM ".$db_table."
WHERE id IN (".$id_list_non_unique.")";
    $resource = mysql_query($sql,$con);
    while($assoc = mysql_fetch_assoc($resource)){
        $array_duplicates[] = array($assoc['id'], $assoc['hostname'], $assoc['interval_avg'], $assoc['last_update'], $assoc['numb_updates']);
    }
}
// END THIS IS BETTER, BUT DONT USE THIS
(Nick Fortescue @ https://stackoverflow.com/a/877051/1248273に感謝)
2. 各ホスト名の min(id) の行について、interval_avg と numb_updates を更新し、last_update を重複行の中で最大の last_update(手順 1 の last_update_max)の値で更新します。
//update the interval_avg, last_update and numb_update value of the min(id)
//of each duplicate hostname.
// --- I will include the solution I didn't go for, ---
// --- so others wont do the same mistake ---
//
// START DONT USE THIS
// 167 secs @ 500k rows
-- Slow variant, shown for comparison only: one CASE expression per column,
-- keyed on id. Add one "WHEN <id> THEN <value>" branch per row to update and
-- list the same ids in the IN (...) filter so untouched rows are skipped.
UPDATE hostname_table
SET
    interval_avg = CASE id
        WHEN 1 THEN 25
        -- ... one WHEN <id> THEN <interval_avg> branch per remaining id
    END,
    last_update = CASE id
        WHEN 1 THEN '2012-04-25 20:22:36'
        -- ... one WHEN <id> THEN '<last_update>' branch per remaining id
    END,
    -- Column is numb_updates (plural), matching the SELECT and INSERT statements.
    numb_updates = CASE id
        WHEN 1 THEN 3
        -- ... one WHEN <id> THEN <numb_updates> branch per remaining id
    END
WHERE id IN (1)
// END DONT USE THIS
//
// START USE THIS
// 5.75 secs @ 500k rows (96.6% improvement)
-- Bulk-update existing rows through a multi-row upsert: every tuple whose id
-- already exists triggers the ON DUPLICATE KEY UPDATE branch instead of an insert.
-- Assumes id is the PRIMARY KEY (or a UNIQUE key) of hostname_table -- confirm.
-- NOTE(review): if other columns (e.g. hostname) are NOT NULL without a default,
-- strict SQL mode may reject the statement; verify against the real schema.
-- NOTE(review): VALUES(col) in this clause is deprecated as of MySQL 8.0.20 in
-- favour of a row alias; it is kept here for compatibility with older servers.
INSERT INTO hostname_table (id, interval_avg, last_update, numb_updates)
VALUES
    (1, 25, '2012-04-25 20:22:36', 3)
    -- , (<id>, <interval_avg>, '<last_update>', <numb_updates>)  one tuple per remaining id
ON DUPLICATE KEY UPDATE
    interval_avg = VALUES(interval_avg),
    last_update  = VALUES(last_update),
    numb_updates = VALUES(numb_updates)
// END USE THIS
(Michiel de Mare @ https://stackoverflow.com/a/3466/1248273に感謝)
3. min(id) を除くすべての重複を削除します
-- Delete every duplicate row, keeping only the row with the lowest id per hostname.
-- The former approach (ALTER IGNORE TABLE ... ADD UNIQUE, then DROP INDEX) no longer
-- works: the IGNORE clause of ALTER TABLE was removed in MySQL 5.7.4. It also
-- relied on the table-copy order to decide which row survived.
-- The derived table lists each duplicated hostname with the id to keep; rows with
-- a NULL hostname never match the join and are therefore left untouched.
DELETE dup
FROM hostname_table AS dup
INNER JOIN (
    SELECT
        hostname,
        MIN(id) AS keep_id
    FROM hostname_table
    GROUP BY hostname
    HAVING COUNT(*) > 1
) AS keeper
    ON keeper.hostname = dup.hostname
WHERE dup.id <> keeper.keep_id;
(最初に必要な情報を選択する際に、正しい方向へ導いてくれた GolezTrol に感謝します)