私はここで多くの助けを得て、そのすべての情報を使って、そのような解決策をまとめることができました. ただし、重みの低い名前が重複する結果になった何かを台無しにした可能性があります。より高い重みでマークした一般名によると、フルネームのより均等に分散されたリストが必要です。重複は問題ありませんが、おそらく TOP 名 + TOP 姓の組み合わせになります。それは私が観察している行動ではありません。
SELECT count(full_name) as total, full_name FROM _get_sample_data(20000) group by (full_name) order by total DESC
上記の SELECT には最大 13 個の名前の繰り返しがあり、それらの名前の繰り返しは低い重みでマークされています。
関数:
CREATE
OR REPLACE FUNCTION _get_sample_data(p_number_of_records INT) RETURNS TABLE(
full_name TEXT,
email TEXT,
phone TEXT,
document TEXT,
date_of_birth DATE
) AS $$
WITH first_names_weighted AS
(
SELECT
first_name,
SUM(ratio) OVER (
ORDER BY
first_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
first_name) AS upper_bound
FROM
common_first_name
),
middle_names_weighted AS
(
SELECT
first_name AS middle_name,
SUM(ratio) OVER (
ORDER BY
first_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
first_name) AS upper_bound
FROM
common_first_name
),
last_names_weighted AS
(
SELECT
last_name,
SUM(ratio) OVER (
ORDER BY
last_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
last_name) AS upper_bound
FROM
common_last_name
),
email_domain_weighted AS
(
SELECT
domain_name,
SUM(ratio) OVER (
ORDER BY
domain_name ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
domain_name ) AS upper_bound
FROM
common_email_domain
),
document_type_weighted AS
(
SELECT
country_iso_document_type,
SUM(ratio) OVER (
ORDER BY
country_iso_document_type ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
country_iso_document_type ) AS upper_bound
FROM
common_document_type
),
year_of_birth_weighted AS
(
SELECT
YEAR,
SUM(ratio) OVER (
ORDER BY
YEAR ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
YEAR ) AS upper_bound
FROM
common_year_of_birth
),
randoms AS
(
SELECT
random() * (
SELECT
SUM(ratio)
FROM
common_first_name ) AS f_random,
random() * (
SELECT
SUM(ratio)
FROM
common_first_name ) AS m_random,
random() * (
SELECT
SUM(ratio)
FROM
common_last_name ) AS l_random,
random() * (
SELECT
SUM(ratio)
FROM
common_email_domain ) AS e_random,
random() * (
SELECT
SUM(ratio)
FROM
common_document_type ) AS d_random,
random() * (
SELECT
SUM(ratio)
FROM
common_year_of_birth ) AS y_random
FROM
generate_series(1, p_number_of_records )
)
SELECT
--r,
BTRIM(first_name)||' '||
CASE
WHEN
random() < 0.5
THEN
''
ELSE
BTRIM(middle_name)||' '
END
||BTRIM(last_name)||
CASE
WHEN
random() < 0.9
THEN
''
ELSE
CASE
WHEN
random() < 0.1
THEN
' Junior'
ELSE
CASE
WHEN
random() < 0.1
THEN
' II'
ELSE
CASE
WHEN
random() < 0.1
THEN
' III'
ELSE
CASE
WHEN
random() < 0.1
THEN
' IV'
ELSE
' Jr.'
END
END
END
END
END
AS full_name, --TODO: Honorif and Suffix
LOWER( SUBSTRING( first_name, 1, 3 + (random() * (length(first_name) - 3))::INTEGER) ||
CASE
WHEN
random() < 0.7
THEN
'.'
ELSE
CASE
WHEN
random() > 0.5
THEN
'_'
ELSE
''
END
END
|| SUBSTRING( last_name, 1, 3 + (random() * (length(last_name) - 3))::INTEGER)||
CASE
WHEN
random() > 0.5
THEN
'.'
ELSE
''
END
||
round((1800 + random()*1200))::TEXT || '@' || domain_name )
AS email,
'+1 ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 9999)::text, 4, '0') AS phone,
CASE
WHEN
POSITION('.sin' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0')
ELSE
CASE
WHEN
POSITION('.driver' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || chr((65 + random()*25)::INT) || lpad(CAST(round(random() * 99999) AS text), 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0')
ELSE
CASE
WHEN
POSITION('.passport' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || chr((65 + random()*25)::INT) || chr((65 + random()*25)::INT) || lpad((round(random() * 999999999999))::TEXT, 12, '0')
ELSE
NULL
END
END
END
AS document,
make_date(YEAR, (round(1 + random() * 11))::INTEGER, 1) + make_interval(days => (round(1 + random() * 30))::INTEGER) AS dob
FROM
randoms r
CROSS JOIN
first_names_weighted f
CROSS JOIN
last_names_weighted l
CROSS JOIN
email_domain_weighted e
CROSS JOIN
document_type_weighted d
CROSS JOIN
year_of_birth_weighted y
CROSS JOIN
middle_names_weighted m
WHERE
f.lower_bound <= r.f_random
AND r.f_random <= f.upper_bound
AND l.lower_bound <= r.l_random
AND r.l_random <= l.upper_bound
AND e.lower_bound <= r.e_random
AND r.e_random <= e.upper_bound
AND d.lower_bound <= r.d_random
AND r.d_random <= d.upper_bound
AND y.lower_bound <= r.y_random
AND r.y_random <= y.upper_bound
AND m.lower_bound <= r.m_random
AND r.m_random <= m.upper_bound;
$$ language sql;
私は 1000 の最も一般的な名前と 1000 の最も一般的な姓 (インターネットから入手したもの) を持っています。ジョンはもっと頻繁に出るべきですが、そうではありません。
データは生成されていますが、どういうわけか、一般的な名前のテーブルに配置した重みが反映されていません。
PS: 誰かが役に立つと思ったら、テーブル構造と SAMPLE DATA を共有できます。
私はsupabase.io(PG 13.3)を使用しています
どうぞ、どんな助けでも大歓迎です。
ありがとう