0

私はここで多くの助けを得て、そのすべての情報を使って、そのような解決策をまとめることができました. ただし、重みの低い名前が重複する結果になった何かを台無しにした可能性があります。より高い重みでマークした一般名によると、フルネームのより均等に分散されたリストが必要です。重複は問題ありませんが、おそらく TOP 名 + TOP 姓の組み合わせになります。それは私が観察している行動ではありません。

SELECT count(full_name) as total, full_name FROM _get_sample_data(20000) group by (full_name) order by total DESC

上記の SELECT には最大 13 個の名前の繰り返しがあり、それらの名前の繰り返しは低い重みでマークされています。

関数:

CREATE
OR REPLACE FUNCTION _get_sample_data(p_number_of_records INT) RETURNS TABLE(
  full_name TEXT,
  email TEXT,
  phone TEXT,
  document TEXT,
  date_of_birth DATE
) AS $$
WITH first_names_weighted AS 
(
    SELECT
        first_name,
        SUM(ratio) OVER (
    ORDER BY
        first_name) - ratio AS lower_bound,
        SUM(ratio) OVER (
    ORDER BY
        first_name) AS upper_bound 
    FROM
        common_first_name 
),
middle_names_weighted AS 
(
    SELECT
        first_name AS middle_name,
        SUM(ratio) OVER (
    ORDER BY
        first_name) - ratio AS lower_bound,
        SUM(ratio) OVER (
    ORDER BY
        first_name) AS upper_bound 
    FROM
        common_first_name 
),
last_names_weighted AS 
(
    SELECT
        last_name,
        SUM(ratio) OVER (
    ORDER BY
        last_name) - ratio AS lower_bound,
        SUM(ratio) OVER (
    ORDER BY
        last_name) AS upper_bound 
    FROM
        common_last_name 
),
email_domain_weighted AS 
(
    SELECT
        domain_name,
        SUM(ratio) OVER ( 
    ORDER BY
        domain_name ) - ratio AS lower_bound,
        SUM(ratio) OVER ( 
    ORDER BY
        domain_name ) AS upper_bound 
    FROM
        common_email_domain 
),
document_type_weighted AS 
(
    SELECT
        country_iso_document_type,
        SUM(ratio) OVER ( 
    ORDER BY
        country_iso_document_type ) - ratio AS lower_bound,
        SUM(ratio) OVER ( 
    ORDER BY
        country_iso_document_type ) AS upper_bound 
    FROM
        common_document_type 
),
year_of_birth_weighted AS 
(
    SELECT
        YEAR,
        SUM(ratio) OVER ( 
    ORDER BY
        YEAR ) - ratio AS lower_bound,
        SUM(ratio) OVER ( 
    ORDER BY
        YEAR ) AS upper_bound 
    FROM
        common_year_of_birth 
),
randoms AS 
(
    SELECT
        random() * ( 
        SELECT
            SUM(ratio) 
        FROM
            common_first_name ) AS f_random,
            random() * ( 
            SELECT
                SUM(ratio) 
            FROM
            common_first_name ) AS m_random,
            random() * ( 
            SELECT
                SUM(ratio) 
            FROM
                common_last_name ) AS l_random,
                random() * ( 
                SELECT
                    SUM(ratio) 
                FROM
                    common_email_domain ) AS e_random,
                    random() * ( 
                    SELECT
                        SUM(ratio) 
                    FROM
                        common_document_type ) AS d_random,
                        random() * ( 
                        SELECT
                            SUM(ratio) 
                        FROM
                            common_year_of_birth ) AS y_random 
                        FROM
                            generate_series(1, p_number_of_records ) 
)
SELECT
    --r, 
    BTRIM(first_name)||' '||
        CASE
        WHEN
            random() < 0.5 
        THEN
            ''
        ELSE
          BTRIM(middle_name)||' '
        END
    ||BTRIM(last_name)||
        CASE
        WHEN
            random() < 0.9 
        THEN
            ''
        ELSE
            CASE
            WHEN
                random() < 0.1 
            THEN
                ' Junior'
            ELSE
                CASE
                WHEN
                    random() < 0.1 
                THEN
                    ' II'
                ELSE
                    CASE
                    WHEN
                        random() < 0.1 
                    THEN
                        ' III'
                    ELSE
                        CASE
                        WHEN
                            random() < 0.1 
                        THEN
                            ' IV'
                        ELSE
                            ' Jr.'
                        END                      
                    END                  
                END               
            END           
        END    
    AS full_name, --TODO: Honorif and Suffix
    LOWER( SUBSTRING( first_name, 1, 3 + (random() * (length(first_name) - 3))::INTEGER) || 
        CASE
        WHEN
            random() < 0.7 
        THEN
            '.'
        ELSE
            CASE         
            WHEN
                random() > 0.5 
            THEN
                '_'
            ELSE
                ''
            END
        END
    || SUBSTRING( last_name, 1, 3 + (random() * (length(last_name) - 3))::INTEGER)|| 
                CASE         
            WHEN
                random() > 0.5 
            THEN
                '.'
            ELSE
                ''
            END
            ||
            round((1800 + random()*1200))::TEXT || '@' || domain_name ) 
    AS email,
    '+1 ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 9999)::text, 4, '0') AS phone,
    CASE
        WHEN
            POSITION('.sin' IN country_iso_document_type) > 1 
        THEN
            country_iso_document_type || '.' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0') 
        ELSE
            CASE
                WHEN
                    POSITION('.driver' IN country_iso_document_type) > 1 
                THEN
                    country_iso_document_type || '.' || chr((65 + random()*25)::INT) || lpad(CAST(round(random() * 99999) AS text), 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0') 
                ELSE
                    CASE
                        WHEN
                            POSITION('.passport' IN country_iso_document_type) > 1 
                        THEN
                            country_iso_document_type || '.' || chr((65 + random()*25)::INT) || chr((65 + random()*25)::INT) || lpad((round(random() * 999999999999))::TEXT, 12, '0') 
                        ELSE
                            NULL 
                    END
            END
    END
    AS document,
    make_date(YEAR, (round(1 + random() * 11))::INTEGER, 1) + make_interval(days => (round(1 + random() * 30))::INTEGER) AS dob 
FROM
    randoms r 
    CROSS JOIN
        first_names_weighted f 
    CROSS JOIN
        last_names_weighted l 
    CROSS JOIN
        email_domain_weighted e 
    CROSS JOIN
        document_type_weighted d 
    CROSS JOIN
        year_of_birth_weighted y 
    CROSS JOIN
        middle_names_weighted m              
WHERE
    f.lower_bound <= r.f_random 
    AND r.f_random <= f.upper_bound 
    AND l.lower_bound <= r.l_random 
    AND r.l_random <= l.upper_bound 
    AND e.lower_bound <= r.e_random 
    AND r.e_random <= e.upper_bound 
    AND d.lower_bound <= r.d_random 
    AND r.d_random <= d.upper_bound 
    AND y.lower_bound <= r.y_random 
    AND r.y_random <= y.upper_bound
    AND m.lower_bound <= r.m_random 
    AND r.m_random <= m.upper_bound;
 
$$ language sql; 

私は 1000 の最も一般的な名前と 1000 の最も一般的な姓 (インターネットから入手したもの) を持っています。ジョンはもっと頻繁に出るべきですが、そうではありません。

データは生成されていますが、どういうわけか、一般的な名前のテーブルに配置した重みが反映されていません。

PS: 誰かが役に立つと思ったら、テーブル構造と SAMPLE DATA を共有できます。

私はsupabase.io(PG 13.3)を使用しています

どうぞ、どんな助けでも大歓迎です。

ありがとう

4

0 に答える 0