#!/usr/bin/perl

# Writes a sample /etc/locale.gen file to standard output as a columnated list,
# consisting only of UTF-8 locales supported by the installed version of glibc,
# with comments indicating the languages and territories in plain English.
#
# Requires: column(1)
#
# Copyright 2025 Kerin Millar <kfm@plushkava.net>
# License GPL-2.0-only <https://spdx.org/licenses/GPL-2.0-only.html>

use v5.36;
use File::Spec::Functions qw(catdir catfile);
use Unicode::Normalize qw(NFKD);

use File::Slurper qw(read_lines read_text);

{
	# The first argument shall be treated as a prefix, if any.
	my $prefix = @ARGV ? $ARGV[0] : '';

	# Read the file containing the supported locale/charmap combinations.
	my $path = catfile($prefix, '/usr/share/i18n', 'SUPPORTED');
	my @lines = read_lines($path);

	# Gather the language and territory attributes of the locale templates.
	my $attr_by = map_locale_attributes($prefix);

	# Use column(1) to write out a nicely columnated list.
	my $pipe = open_column("\037");

	for my $line (@lines) {
		my ($read_locale, $charmap) = split ' ', $line;

		# The names of the templates don't incorporate a codeset part.
		my $locale = $read_locale =~ s/\.[^@]+//r;

		# Select only UTF-8 locales and refrain from incorporating the
		# C.UTF-8 locale because is always compiled by locale-gen(8).
		next if $charmap ne 'UTF-8' || $locale eq 'C';

		# Compose a commented entry which also has a trailing comment,
		# indicating the language and territory in plain English.
		my ($comment, $territory) = $attr_by->{$locale}->@{'language', 'territory'};
		if (! length $comment) {
			die "Can't find a language attribute for '$read_locale'";
		} else {
			if (length $territory) {
				$comment .= " ($territory)";
			}
			printf {$pipe} "# %s\037%s\037# %s\n", $read_locale, $charmap, $comment;
		}
	}
	close $pipe or exit 1;
}

sub map_locale_attributes ($prefix) {
	my $top = catdir($prefix, '/usr/share/i18n/locales');
	opendir(my $dh, $top) or die "Can't open '$top' as a directory: $!";
	my $regex = qr/
		^
		language  # attribute key
		\h+       # one or more <blank> characters
		"([^"]+)" # non-empty attribute value
		\n        # line break
		territory
		\h+
		"([^"]*)" # attribute value
		$
	/mx;
	my %attr_by;
	while (my $locale = readdir $dh) {
		next if $locale =~ m/^\./;
		my $data = read_text("$top/$locale");
		if ($data =~ $regex) {
			my ($language, $territory) = ($1, ucfirst $2);
			for ($language, $territory) {
				if (m/[^\p{ASCII}]/) {
					$_ = to_ascii($_);
				}
			}
			for ($territory) {
				if (m/^Myanmar/) {
					$_ = 'Myanmar/Burma';
				} elsif (m/^Russian Federation\z/) {
					$_ = 'Russia';
				} elsif (m/^Turkiye\z/) {
					$_ = 'Turkey';
				}
			}
			$attr_by{$locale} = {
				'language'  => $language,
				'territory' => $territory
			};
		}
	}
	return \%attr_by;
}

sub to_ascii ($str) {
	# This behaves similarly to "iconv -f UTF-8 -t US-ASCII//TRANSLIT". At
	# least, to a degree that is sufficient for the inputs being processed.
	$str = NFKD($str);
	$str =~ s/\p{NonspacingMark}//g;
	return $str;
}

sub open_column ($sep) {
	# Ensure that column(1) considers its input stream as US-ASCII.
	local $ENV{'LC_ALL'} = 'C';

	# The encoding is applied as a precaution; over-wide characters written
	# to the pipe become Perl escape sequences e.g. U+00E5 => \x{00e5}.
	if (! open my $pipe, '|-:encoding(US-ASCII)', 'column', '-ts', $sep) {
		exit 127;
	} else {
		return $pipe;
	}
}
