Deep Dive into Linux TProxy

TOC

This was my first foray into the kernel networking stack. If you spot any errors, feel free to let me know (email) and I will annotate corrections in the article.

TProxy (Transparent Proxy) is a kernel-supported transparent proxying mechanism introduced in Linux 2.6.28. Unlike NAT, which modifies the packet’s destination address for redirection, TProxy merely replaces the socket held by the packet’s skb, without modifying packet headers.

Terminology note: TProxy is the general name for the feature, while TPROXY is the name of an iptables extension.

IP_TRANSPARENT

The IP_TRANSPARENT option allows a socket to treat any non-local address as a local address, enabling it to bind to non-local addresses and masquerade as a non-local address when sending and receiving data.

int opt = 1;
setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt));

For example, a gateway (192.168.0.1 / 123.x.x.94) acting as a transparent proxy intercepts the connection between a client (192.168.0.200) and a remote server (157.x.x.149). It connects to the remote server on behalf of the client, while also masquerading as the remote server when communicating with the client:

$ netstat -atunp
Proto Recv-Q Send-Q Local Address           Foreign Address            State       PID/Program name
tcp        0      0 123.x.x.94:37338        157.x.x.149:443            ESTABLISHED 2904/proxy
tcp        0      0 ::ffff:157.x.x.149:443  ::ffff:192.168.0.200:56418 ESTABLISHED 2904/proxy

Inbound Redirection

Why Replace the Socket

When the kernel networking stack receives a packet, it looks up the most closely matching socket from the corresponding protocol’s hash table based on the packet’s 5-tuple, then places the packet into that socket’s receive queue. Taking UDP as an example:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/udp.c#L2405
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	// ...
	sk = skb_steal_sock(skb, &refcounted);
	if (sk) {
		// ...
		ret = udp_unicast_rcv_skb(sk, skb, uh);

static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
	if (skb->sk) {
		struct sock *sk = skb->sk;
		// ...
		return sk;

static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	// ...
	ret = udp_queue_rcv_skb(sk, skb);

Netfilter hooks execute before the protocol stack, so modifying skb->sk in netfilter determines which socket’s receive queue the packet will ultimately be placed into.

Kernel Implementation

Based on kernel v6.1.34, using the iptables TPROXY module implementation as an example. The nftables implementation is essentially the same.

Core Logic

The main processing flow is in tproxy_tg4() from net/netfilter/xt_TPROXY.c.

Extract headers from the skb:

static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
	   u_int32_t mark_mask, u_int32_t mark_value)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct udphdr _hdr, *hp;
	struct sock *sk;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL)
		return NF_DROP;

Then begin searching for a socket (sk in the code) to replace the packet skb’s original socket.

If a previous packet with the same 4-tuple was already redirected, then the proxy should have already established a connection with the client, and the current packet should also be redirected to that connection:

	/* check if there's an ongoing connection on the packet
	 * addresses, this happens if the redirect already happened
	 * and the current packet belongs to an already established
	 * connection */
	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
				   iph->saddr, iph->daddr,
				   hp->source, hp->dest,
				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);

Set the default redirection destination — unprocessed packets should all be redirected here. The rule-specified address takes priority; otherwise, the primary address of the receiving network device is used:

	laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
	if (!lport)
		lport = hp->dest;

/*
 * Choose the IPv4 address that a redirected packet should be delivered to.
 *
 * @skb:        packet being processed; only skb->dev is read here.
 * @user_laddr: address given by the rule, or 0 when the rule gave none.
 * @daddr:      fallback address, returned when the device yields nothing.
 *
 * Returns user_laddr if it is non-zero; otherwise the first address on
 * skb->dev that is not flagged IFA_F_SECONDARY; otherwise daddr.
 */
__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
	const struct in_ifaddr *ifa;
	struct in_device *indev;
	__be32 laddr;

	/* an explicit rule-supplied address always wins */
	if (user_laddr)
		return user_laddr;

	laddr = 0;
	/* NOTE(review): indev is iterated below without a NULL check --
	 * confirm __in_dev_get_rcu() cannot return NULL for skb->dev here */
	indev = __in_dev_get_rcu(skb->dev);

	in_dev_for_each_ifa_rcu(ifa, indev) {
		/* skip secondary addresses; take the first remaining one */
		if (ifa->ifa_flags & IFA_F_SECONDARY)
			continue;

		laddr = ifa->ifa_local;
		break;
	}

	/* laddr is still 0 if the loop found no suitable address */
	return laddr ? laddr : daddr;
}

Forward SYN packets to the proxy to establish new connections instead of reusing TIME_WAIT connections. My guess is that this allows the proxy to more easily synchronize the state of both sides of the connection (client <-> proxy <-> remote):

	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
	if (sk && sk->sk_state == TCP_TIME_WAIT)
		/* reopening a TIME_WAIT connection needs special handling */
		sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);

/**
 * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
 * @net:	The network namespace used for the listener lookup.
 * @skb:	The skb being processed.
 * @laddr:	IPv4 address to redirect to or zero.
 * @lport:	TCP port to redirect to or zero.
 * @sk:		The TIME_WAIT TCP socket found by the lookup.
 *
 * We have to handle SYN packets arriving to TIME_WAIT sockets
 * differently: instead of reopening the connection we should rather
 * redirect the new connection to the proxy if there's a listener
 * socket present.
 *
 * nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
 *
 * Returns the listener socket if there's one, the TIME_WAIT socket if
 * no such listener is found, or NULL if the TCP header is incomplete.
 */
struct sock *
nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
			 __be32 laddr, __be16 lport, struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr _hdr, *hp;

	/* the TCP header starts right after the IP header; _hdr is the
	 * caller-provided scratch copy passed to skb_header_pointer() */
	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL) {
		/* header unavailable: release the TIME_WAIT socket we were
		 * handed, since this function consumes that reference */
		inet_twsk_put(inet_twsk(sk));
		return NULL;
	}

	/* only a pure SYN (syn set; rst, ack and fin all clear) qualifies */
	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
		/* SYN to a TIME_WAIT socket, we'd rather redirect it
		 * to a listener socket if there's one */
		struct sock *sk2;

		/* look for a listener on the redirect address/port, falling
		 * back to the packet's own destination where either is zero */
		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					    iph->saddr, laddr ? laddr : iph->daddr,
					    hp->source, lport ? lport : hp->dest,
					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
		if (sk2) {
			/* listener found: give up the TIME_WAIT socket
			 * (presumably deschedule + put, going by the helper's
			 * name) and return the listener instead */
			nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
		}
	}

	/* either the listener, or the original TIME_WAIT socket untouched */
	return sk;
}

If no established connection was matched, use the listening-state redirection destination socket:

	else if (!sk)
		/* no, there's no established connection, check if
		 * there's a listener on the redirected addr/port */
		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					   iph->saddr, laddr,
					   hp->source, lport,
					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);

Finally, verify that the new socket meets the transparent proxy requirements, then replace the packet skb’s original socket:

	/* NOTE: assign_sock consumes our sk reference */
	if (sk && nf_tproxy_sk_is_transparent(sk)) {
		/* This should be in a separate target, but we don't do multiple
		   targets on the same rule yet */
		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
		nf_tproxy_assign_sock(skb, sk);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/* assign a socket to the skb -- consumes sk
 *
 * @skb: packet whose owning socket is being replaced.
 * @sk:  socket to attach; the caller's reference is handed over to the skb.
 */
static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
	/* detach the skb from whatever owner it currently has */
	skb_orphan(skb);
	/* from here on the protocol layer will find sk via skb->sk */
	skb->sk = sk;
	/* sock_edemux is installed as the skb's destructor -- presumably it
	 * is what later drops the reference taken on sk; confirm in sock.c */
	skb->destructor = sock_edemux;
}

Socket Matching

nf_tproxy_get_sock_v4() is a simple wrapper around the generic TCP/UDP socket matching methods.

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/netfilter/nf_tproxy_ipv4.c#L75
/*
 * This is used when the user wants to intercept a connection matching
 * an explicit iptables rule. In this case the sockets are assumed
 * matching in preference order:
 *
 *   - match: if there's a fully established connection matching the
 *     _packet_ tuple, it is returned, assuming the redirection
 *     already took place and we process a packet belonging to an
 *     established connection
 *
 *   - match: if there's a listening socket matching the redirection
 *     (e.g. on-port & on-ip of the connection), it is returned,
 *     regardless if it was bound to 0.0.0.0 or an explicit
 *     address. The reasoning is that if there's an explicit rule, it
 *     does not really matter if the listener is bound to an interface
 *     or to 0. The user already stated that he wants redirection
 *     (since he added the rule).
 *
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
 */
struct sock *
nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
		      const u8 protocol,
		      const __be32 saddr, const __be32 daddr,
		      const __be16 sport, const __be16 dport,
		      const struct net_device *in,
		      const enum nf_tproxy_lookup_t lookup_type)
{
	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
	struct sock *sk;
	switch (protocol) {

TCP has corresponding matching methods for both states. The only extra step is incrementing the reference count for listening-state sockets to prevent them from being freed while still in use:

	case IPPROTO_TCP: {
		struct tcphdr _hdr, *hp;

		hp = skb_header_pointer(skb, ip_hdrlen(skb),
					sizeof(struct tcphdr), &_hdr);
		if (hp == NULL)
			return NULL;

		switch (lookup_type) {
		case NF_TPROXY_LOOKUP_LISTENER:
			sk = inet_lookup_listener(net, hinfo, skb,
						  ip_hdrlen(skb) + __tcp_hdrlen(hp),
						  saddr, sport, daddr, dport,
						  in->ifindex, 0);

			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
				sk = NULL;
			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			break;
		case NF_TPROXY_LOOKUP_ESTABLISHED:
			sk = inet_lookup_established(net, hinfo, saddr, sport,
						     daddr, dport, in->ifindex);
			break;
		default:
			BUG();
		}
		break;
		}

UDP requires additional checks to determine whether the match result is usable:

	case IPPROTO_UDP:
		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
				     in->ifindex);
		if (sk) {
			int connected = (sk->sk_state == TCP_ESTABLISHED);
			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);

			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
			      (!connected || wildcard)) ||
			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
				sock_put(sk);
				sk = NULL;
			}
		}
		break;

There are two qualifying conditions:

connected indicates whether the socket is “connected”
wildcard indicates whether the bind address is INADDR_ANY (0.0.0.0)

However, the condition !connected || wildcard is puzzling, because when connected is true, wildcard is necessarily false, making || wildcard redundant.

When a UDP socket connect()s to a target, it enters the connected state. If it was not previously bound to an exact IP that could be written into the IP packet’s destination address field, then during connect() the system’s static routing selects a local address to use as both the source address and the local bind address, and assigns it to the inet_rcv_saddr field. Only a disconnect will set the inet_rcv_saddr field back to INADDR_ANY:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/datagram.c#L64
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	//...

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;	/* Update source address */
	if (!inet->inet_rcv_saddr) {
		inet->inet_rcv_saddr = fl4->saddr;
		if (sk->sk_prot->rehash)
			sk->sk_prot->rehash(sk);
	}

	// ...

	sk->sk_state = TCP_ESTABLISHED;

	// ...
}

int __udp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	/*
	 *	1003.1g - break association.
	 */

	sk->sk_state = TCP_CLOSE;

	// ...

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
		inet_reset_saddr(sk);

	// ...
}

static __inline__ void inet_reset_saddr(struct sock *sk)
{
	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;

Therefore, a connected UDP socket’s inet_rcv_saddr is always an exact IP address and can never be INADDR_ANY.

The commit that added these qualifying conditions mentions that nf_tproxy_get_sock_v4() is also used by the iptables socket extension. I suspect this might be a historical artifact.

Usage

Using the iptables TPROXY extension as an example:

Specify the redirection destination with --on-port/--on-ip
Since the packet’s destination address is not modified, the routing decision after PREROUTING will still forward the packet to the FORWARD chain because the destination is not a local address. Therefore, policy routing is needed to steer the packet into the INPUT chain

ip rule add fwmark 0x233 table 100
ip route add local default dev lo table 100

iptables -t mangle -A PREROUTING -p udp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233

This replaces the packet’s original socket with the one bound to :10000, while also setting the 0x233 fwmark. Policy routing is configured so that all packets with the 0x233 fwmark use routing table 100. The local type route in table 100 means that the destinations are assigned to this host: **the packets are looped back and delivered locally** (documentation), ~~and packets sent from the loopback device are all treated as destined for the local host~~, thereby preventing them from being forwarded out.

2025/03/31 Update:

“Packets sent from the loopback device are all treated as destined for the local host” is incorrect. The real key is the routing rule ip route add local default dev lo table 100, where local forces the packet to be received locally. So when the packet comes back out of lo and reaches the Routing decision after PREROUTING again, it is considered destined for the local host and is delivered to INPUT.

Therefore, the inbound/outbound flow works like this:

Inbound traffic -> PREROUTING, fwmark is added to the packet -> Routing decision finds the fwmark matches a routing rule, local forces local delivery -> forwarded to lo -> comes back out of lo as inbound traffic again -> PREROUTING -> Routing decision determines the packet is destined for the local host -> INPUT
Outbound traffic -> OUTPUT, fwmark is added -> Routing decision finds the fwmark matches a routing rule… (the rest follows the same flow as inbound)

Using `-m socket` for Traffic Splitting to Improve Performance

There is no very clear explanation for this; the following is my personal understanding and speculation.

The comment in nf_tproxy_get_sock_v4() mentions this point:

/*
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
*/

After a packet redirected by TProxy establishes a connection, the networking stack has a mapping between the packet’s original 5-tuple and the socket. Subsequent packets for that connection will match the socket through the stack’s normal processing — the same socket that TPROXY’s sk = nf_tproxy_get_sock_v4(...., NF_TPROXY_LOOKUP_ESTABLISHED) would match — which is already the redirected one, making the subsequent replacement unnecessary.

2024/06/17 Update: Analysis of the performance difference.

In TProxy, nf_tproxy_assign_sock is executed to replace the sk. The skb_orphan call within it invokes the skb destructor sock_edemux, which calls sock_gen_put to decrement the sk’s reference count. But for “already-redirected connections,” this is entirely redundant, because the old and new sk are the same.

In contrast, the socket module only needs to call sock_gen_put when the found sk differs from the one associated with the skb.

Therefore, the redundant and frequent invocations of sock_gen_put in TProxy can impact performance to some degree.

Additionally, since TProxy and socket were committed together, I speculate that the developers intended transparent proxying to be a collaborative effort between these two modules: socket handles established connections, while TProxy handles new connections. This also explains why TProxy does not check sk != skb->sk when replacing the sk — perhaps precisely because the developers assumed that TProxy mostly handles new connections that have not been redirected yet, and the established connection check is just a safety fallback.

It is relatively uncommon for proxy programs to connect() to the client for UDP, so only TCP is used as an example here:

iptables -t mangle -N tproxy_divert
iptables -t mangle -A tproxy_divert -j MARK --set-mark 0x233
iptables -t mangle -A tproxy_divert -j ACCEPT

iptables -t mangle -A PREROUTING -p tcp -m socket -j tproxy_divert
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-port 10000 --on-ip 127.0.0.1 --tproxy-mark 0x233

Retrieving the Original Destination Address

TCP

Use getsockname() to obtain the “local” address of the client socket, which is the packet’s original destination address:

// accept() fills client_addr with the address of the connecting client.
client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &addr_len);

// The accepted socket's "local" address is the packet's original destination.
getsockname(client_fd, (struct sockaddr*) orig_dst, &addrlen);

UDP

Use setsockopt(..., SOL_IP, IP_RECVORIGDSTADDR, ...) to set the socket option so that recvmsg() provides IP_ORIGDSTADDR ancillary data, which is the packet’s destination address. Thanks to TProxy not modifying the original packet, this ancillary information is obtained from the IP header:

// /net/ipv4/ip_sockglue.c
/*
 * Attach the packet's destination address and port to a received message
 * as SOL_IP / IP_ORIGDSTADDR ancillary data.
 *
 * @msg: message header the control message is appended to.
 * @skb: received packet; its IP header and transport header are read.
 *
 * Emits nothing if fewer than four bytes of transport header lie within
 * skb->len.
 */
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
	struct sockaddr_in sin;
	const struct iphdr *iph = ip_hdr(skb);
	/* view the start of the transport header as 16-bit words */
	__be16 *ports = (__be16 *)skb_transport_header(skb);

	/* need the first four transport-header bytes (both ports) in the skb */
	if (skb_transport_offset(skb) + 4 > (int)skb->len)
		return;

	/* All current transport protocols have the port numbers in the
	 * first four bytes of the transport header and this function is
	 * written with this assumption in mind.
	 */

	sin.sin_family = AF_INET;
	/* address comes straight from the IP header's destination field */
	sin.sin_addr.s_addr = iph->daddr;
	/* ports[1] is the second 16-bit word, i.e. the destination port under
	 * the layout assumption stated above */
	sin.sin_port = ports[1];
	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}

Use recvmsg() to read the packet and its ancillary data
The ancillary data with level SOL_IP and type IP_ORIGDSTADDR contains the original destination address

Complete example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_BUF_SIZE 1024
#define SRC_ADDR INADDR_ANY
#define SRC_PORT 9999

/*
 * Minimal UDP TProxy receiver.
 *
 * Creates a UDP socket with IP_TRANSPARENT set, binds it to
 * SRC_ADDR:SRC_PORT, enables IP_RECVORIGDSTADDR and then loops forever.
 * For every datagram it prints the original destination address (taken
 * from the IP_ORIGDSTADDR control message) followed by the payload.
 *
 * Exits with EXIT_FAILURE if the socket cannot be created, configured or
 * bound. A failed recvmsg() is reported and the loop carries on.
 */
int main(void) {
  int sockfd;
  struct sockaddr_in bind_addr, client_addr;
  char buffer[MAX_BUF_SIZE];

  if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
    perror("socket");
    exit(EXIT_FAILURE);
  }

  int opt = 1;
  if (setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  // bind
  memset(&bind_addr, 0, sizeof(bind_addr));
  bind_addr.sin_family = AF_INET;
  bind_addr.sin_addr.s_addr = htonl(SRC_ADDR);
  bind_addr.sin_port = htons(SRC_PORT);
  if (bind(sockfd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
    perror("bind");
    exit(EXIT_FAILURE);
  }

  // Ask the kernel to deliver the original destination as ancillary data.
  if (setsockopt(sockfd, SOL_IP, IP_RECVORIGDSTADDR, &opt, sizeof(opt)) < 0) {
    perror("IP_RECVORIGDSTADDR");
    exit(EXIT_FAILURE);
  }

  while (1) {
    struct msghdr msgh = {0};
    struct iovec iov[1];
    // The IP_ORIGDSTADDR payload is a whole struct sockaddr_in, so the
    // control buffer must be sized for that; a smaller buffer makes the
    // kernel truncate the control message (MSG_CTRUNC). The union keeps the
    // buffer suitably aligned for struct cmsghdr.
    union {
      char buf[CMSG_SPACE(sizeof(struct sockaddr_in))];
      struct cmsghdr align;
    } cmsgbuf;

    iov[0].iov_base = buffer;
    // Keep one byte spare so the payload can always be NUL-terminated.
    iov[0].iov_len = sizeof(buffer) - 1;
    msgh.msg_iov = iov;
    msgh.msg_iovlen = 1;
    msgh.msg_name = &client_addr;
    msgh.msg_namelen = sizeof(client_addr);
    msgh.msg_control = cmsgbuf.buf;
    msgh.msg_controllen = sizeof(cmsgbuf.buf);

    ssize_t nread = recvmsg(sockfd, &msgh, 0);
    if (nread < 0) {
      perror("recvmsg");
      continue;
    }
    buffer[nread] = '\0';

    if (msgh.msg_flags & MSG_CTRUNC) {
      fprintf(stderr, "recvmsg: control data truncated\n");
    }

    struct cmsghdr *cmsg;
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
      if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_ORIGDSTADDR &&
          cmsg->cmsg_len >= CMSG_LEN(sizeof(struct sockaddr_in))) {
        // CMSG_DATA() is not guaranteed to be aligned for sockaddr_in;
        // copy it out instead of dereferencing through a cast.
        struct sockaddr_in orig_dst;
        memcpy(&orig_dst, CMSG_DATA(cmsg), sizeof(orig_dst));
        printf("Original DST ADDR: %s\n", inet_ntoa(orig_dst.sin_addr));
        break;
      }
    }
    printf("Data: %s\n", buffer);
  }

  // Not reached: the receive loop above never exits.
  close(sockfd);

  return 0;
}

References

Examples: