xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 97dfeb96704e5dbc5bec32ad7b21379d0125e031)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/list.h>
35 
36 #include "EndpointManager.h"
37 
38 
39 // References:
40 //	- RFC 793 - Transmission Control Protocol
41 //	- RFC 813 - Window and Acknowledgement Strategy in TCP
42 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
43 //
44 // Things this implementation currently doesn't implement:
45 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
46 //	  RFC 2001, RFC 2581, RFC 3042
47 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
48 //	- Explicit Congestion Notification (ECN), RFC 3168
49 //	- SYN-Cache
50 //	- TCP Extensions for High Performance, RFC 1323
51 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
52 //	- Forward RTO-Recovery, RFC 4138
53 //	- Time-Wait hash instead of keeping sockets alive
54 
// Formats \a address as a C string for use in TRACE()/PROBE() output.
// Expands to a call to Domain(), so it can only be used where that is in
// scope. The string belongs to a temporary AddressString and is therefore
// only valid within the full expression the macro is used in.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
57 
58 //#define TRACE_TCP
59 //#define PROBE_TCP
60 
#ifdef TRACE_TCP
// TRACE() prints a debug message via dprintf(), prefixed with the current
// thread ID, the system time, the endpoint pointer and the name of its current
// state. It references "this" and "fState", so it can only be used inside
// TCPEndpoint member functions. Without TRACE_TCP it expands to an empty
// statement.
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
69 
#ifdef PROBE_TCP
// PROBE() dumps one line of send-side state for \a buffer: its addresses and
// size, the send sequence numbers, congestion window, slow start threshold,
// the passed in \a window, the send window, the send queue fill level and the
// retransmit timeout. It accesses TCPEndpoint members directly, so it can only
// be used inside member functions. Without PROBE_TCP it expands to an empty
// statement.
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
		" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
		B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
		B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
		fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif
84 
85 #if TCP_TRACING
86 namespace TCPTracing {
87 
/*!	Trace entry describing a received segment.
	All values that are printed later (buffer size, sequence and acknowledge
	numbers, flags, window, and the endpoint's state) are copied when the entry
	is constructed. The endpoint and buffer pointers are only stored to be
	printed; AddDump() does not dereference them.
*/
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the values captured in the constructor.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
	uint8			fFlags;
};
123 
/*!	Trace entry describing a sent segment.
	Besides the segment header values it records the \a firstSequence and
	\a lastSequence numbers passed in by the caller. Everything printed later
	is copied at construction time; the endpoint and buffer pointers are only
	stored to be printed.
*/
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
			tcp_sequence firstSequence, tcp_sequence lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence.Number()),
		fLastSequence(lastSequence.Number()),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the values captured in the constructor.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
			fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
	uint8			fFlags;
};
162 
/*!	Trace entry marking a state change of an endpoint.
	It records the state the endpoint is in when the entry is created, so it
	has to be created after fState was updated.
*/
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint pointer and the name of the captured state.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};
183 
/*!	Trace entry recording that \a listeningEndpoint spawned
	\a spawnedEndpoint. Only the two pointers are stored and printed.
*/
class Spawn : public AbstractTraceEntry {
public:
	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
		:
		fListeningEndpoint(listeningEndpoint),
		fSpawnedEndpoint(spawnedEndpoint)
	{
		Initialized();
	}

	// Prints both endpoint pointers.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
	}

protected:
	TCPEndpoint*	fListeningEndpoint;
	TCPEndpoint*	fSpawnedEndpoint;
};
203 
/*!	Trace entry recording an error message together with the source line it
	was reported from and the endpoint's state at that time.
	The \a error string is stored by pointer, not copied, so it has to remain
	valid for as long as the entry may be dumped.
*/
class Error : public AbstractTraceEntry {
public:
	Error(TCPEndpoint* endpoint, const char* error, int32 line)
		:
		fEndpoint(endpoint),
		fLine(line),
		fError(error),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, its captured state, the line and the message.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
			name_for_state(fState), fLine, fError);
	}

protected:
	TCPEndpoint*	fEndpoint;
	int32			fLine;
	const char*		fError;
	tcp_state		fState;
};
228 
/*!	Trace entry recording that the timer named \a which was set to
	\a timeout. Within this file, a timeout of -1 is passed when a timer is
	cancelled. The \a which string is stored by pointer, not copied.
*/
class TimerSet : public AbstractTraceEntry {
public:
	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fTimeout(timeout),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, its captured state, the timer name and timeout.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
			name_for_state(fState), fWhich, fTimeout);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	bigtime_t		fTimeout;
	tcp_state		fState;
};
253 
/*!	Trace entry recording that the timer named \a which has fired, together
	with the endpoint's state at that time. The \a which string is stored by
	pointer, not copied.
*/
class TimerTriggered : public AbstractTraceEntry {
public:
	TimerTriggered(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, its captured state and the timer name.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};
276 
/*!	Trace entry recording that the API function named \a which was called on
	an endpoint, together with the endpoint's state at that time. The \a which
	string is stored by pointer, not copied.
*/
class APICall : public AbstractTraceEntry {
public:
	APICall(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, its captured state and the API function name.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};
299 
300 }	// namespace TCPTracing
301 
// T(Entry(arguments)) creates a TCPTracing::Entry trace entry using the
// non-throwing new operator. When TCP_TRACING is disabled, T() expands to
// nothing, so its arguments are not evaluated at all.
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
305 #endif	// TCP_TRACING
306 
307 
// constants for the fFlags field
enum {
	// Both option flags are set by default in the constructor.
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
	// Set by Shutdown() for SHUT_RD/SHUT_RDWR; makes ReadData() return
	// instead of waiting for more data.
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 0x04,
	// Set by Free() once the endpoint has been freed by its owner.
	FLAG_CLOSED					= 0x08,
	// Set by _Close(), and by _EnterTimeWait() for local connections.
	FLAG_DELETE_ON_CLOSE		= 0x10,
	// Reported by IsLocal(); local connections skip the TIME_WAIT state.
	FLAG_LOCAL					= 0x20
};
320 
321 
// Conversion factor between the microsecond resolution of system_time() and
// the millisecond resolution used for TCP time (see tcp_now()).
static const int kTimestampFactor = 1000;
324 
325 
326 static inline bigtime_t
327 absolute_timeout(bigtime_t timeout)
328 {
329 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
330 		return timeout;
331 
332 	return timeout + system_time();
333 }
334 
335 
336 static inline status_t
337 posix_error(status_t error)
338 {
339 	if (error == B_TIMED_OUT)
340 		return B_WOULD_BLOCK;
341 
342 	return error;
343 }
344 
345 
346 static inline bool
347 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
348 	uint32 receiveWindow)
349 {
350 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
351 }
352 
353 
354 static inline bool
355 segment_in_sequence(const tcp_segment_header& segment, int size,
356 	const tcp_sequence& receiveNext, uint32 receiveWindow)
357 {
358 	tcp_sequence sequence(segment.sequence);
359 	if (size == 0) {
360 		if (receiveWindow == 0)
361 			return sequence == receiveNext;
362 		return in_window(sequence, receiveNext, receiveWindow);
363 	} else {
364 		if (receiveWindow == 0)
365 			return false;
366 		return in_window(sequence, receiveNext, receiveWindow)
367 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
368 	}
369 }
370 
371 
372 static inline bool
373 is_writable(tcp_state state)
374 {
375 	return state == ESTABLISHED || state == FINISH_RECEIVED;
376 }
377 
378 
379 static inline bool
380 is_establishing(tcp_state state)
381 {
382 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
383 }
384 
385 
386 static inline uint32 tcp_now()
387 {
388 	return system_time() / kTimestampFactor;
389 }
390 
391 
392 static inline uint32 tcp_diff_timestamp(uint32 base)
393 {
394 	uint32 now = tcp_now();
395 
396 	if (now > base)
397 		return now - base;
398 
399 	return now + UINT_MAX - base;
400 }
401 
402 
403 static inline bool
404 state_needs_finish(int32 state)
405 {
406 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
407 		|| state == FINISH_SENT || state == CLOSING;
408 }
409 
410 
411 //	#pragma mark -
412 
413 
414 TCPEndpoint::TCPEndpoint(net_socket* socket)
415 	:
416 	ProtocolSocket(socket),
417 	fManager(NULL),
418 	fOptions(0),
419 	fSendWindowShift(0),
420 	fReceiveWindowShift(0),
421 	fSendUnacknowledged(0),
422 	fSendNext(0),
423 	fSendMax(0),
424 	fSendUrgentOffset(0),
425 	fSendWindow(0),
426 	fSendMaxWindow(0),
427 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
428 	fSendQueue(socket->send.buffer_size),
429 	fInitialSendSequence(0),
430 	fDuplicateAcknowledgeCount(0),
431 	fRoute(NULL),
432 	fReceiveNext(0),
433 	fReceiveMaxAdvertised(0),
434 	fReceiveWindow(socket->receive.buffer_size),
435 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
436 	fReceiveQueue(socket->receive.buffer_size),
437 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
438 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
439 	fRetransmitTimeout(TCP_INITIAL_RTT),
440 	fReceivedTimestamp(0),
441 	fCongestionWindow(0),
442 	fSlowStartThreshold(0),
443 	fState(CLOSED),
444 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
445 {
446 	// TODO: to be replaced with a real read/write locking strategy!
447 	mutex_init(&fLock, "tcp lock");
448 
449 	fReceiveCondition.Init(this, "tcp receive");
450 	fSendCondition.Init(this, "tcp send");
451 
452 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
453 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
454 		this);
455 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
456 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
457 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
458 		this);
459 
460 	T(APICall(this, "constructor"));
461 }
462 
463 
/*!	Tears the endpoint down: cancels all timers, unbinds it from its endpoint
	manager (if Open() obtained one), destroys the lock, waits for timer hooks
	that may still be running, and finally releases the route.
*/
TCPEndpoint::~TCPEndpoint()
{
	// The lock is acquired and never released; it is destroyed below.
	mutex_lock(&fLock);

	T(APICall(this, "destructor"));

	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);
	T(TimerSet(this, "time-wait", -1));

	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);

	// NOTE(review): fRoute is still NULL if no send path was ever prepared;
	// presumably put_route() copes with that - confirm.
	gDatalinkModule->put_route(Domain(), fRoute);
}
489 
490 
/*!	Always returns B_OK; the constructor has no failure path that would be
	reported here.
*/
status_t
TCPEndpoint::InitCheck() const
{
	return B_OK;
}
496 
497 
498 //	#pragma mark - protocol API
499 
500 
501 status_t
502 TCPEndpoint::Open()
503 {
504 	TRACE("Open()");
505 	T(APICall(this, "open"));
506 
507 	status_t status = ProtocolSocket::Open();
508 	if (status < B_OK)
509 		return status;
510 
511 	fManager = get_endpoint_manager(Domain());
512 	if (fManager == NULL)
513 		return EAFNOSUPPORT;
514 
515 	return B_OK;
516 }
517 
518 
/*!	Closes the endpoint.
	A listening endpoint gets its accept semaphore deleted. Endpoints in the
	LISTEN or SYNCHRONIZE_SENT state are moved to CLOSED right away. All others
	are disconnected via _Disconnect(); if SO_LINGER is set, the call then
	waits for up to socket->linger seconds until the send queue is empty.
	\return B_OK, or the error _Disconnect() or the wait returned (running
		into the linger timeout is not an error).
*/
status_t
TCPEndpoint::Close()
{
	MutexLocker locker(fLock);

	TRACE("Close()");
	T(APICall(this, "close"));

	if (fState == LISTEN)
		delete_sem(fAcceptSemaphore);

	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
		// TODO: what about linger in case of SYNCHRONIZE_SENT?
		fState = CLOSED;
		T(State(this));
		return B_OK;
	}

	status_t status = _Disconnect(true);
	if (status != B_OK)
		return status;

	if (socket->options & SO_LINGER) {
		TRACE("Close(): Lingering for %i secs", socket->linger);

		// socket->linger is in seconds
		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);

		while (fSendQueue.Used() > 0) {
			status = _WaitForCondition(fSendCondition, locker, maximum);
			// giving up after the linger time has passed is not an error
			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
				break;
			else if (status < B_OK)
				return status;
		}

		TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
			" bytes.", fSendQueue.Used());
	}
	return B_OK;
}
559 
560 
561 void
562 TCPEndpoint::Free()
563 {
564 	MutexLocker _(fLock);
565 
566 	TRACE("Free()");
567 	T(APICall(this, "free"));
568 
569 	if (fState <= SYNCHRONIZE_SENT)
570 		return;
571 
572 	// we are only interested in the timer, not in changing state
573 	_EnterTimeWait();
574 
575 	fFlags |= FLAG_CLOSED;
576 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
577 		// we'll be freed later when the 2MSL timer expires
578 		gSocketModule->acquire_socket(socket);
579 	}
580 }
581 
582 
/*!	Creates and sends a synchronize packet to \a address, and then waits
	until the connection has been established or refused.

	An empty destination address is replaced by the loopback address. If the
	socket's send timeout is 0 (non-blocking), EINPROGRESS is returned right
	after the SYN has been sent. When the syscall is restarted, only the wait
	is resumed with the previously stored absolute timeout.

	\return B_OK once the connection is established, EAFNOSUPPORT if
		\a address belongs to another address family, EISCONN if already
		connected, EINPROGRESS if a connection attempt is under way, or the
		error of preparing the send path, sending, or waiting (with
		B_TIMED_OUT mapped to B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
	if (!AddressModule()->is_same_family(address))
		return EAFNOSUPPORT;

	MutexLocker locker(fLock);

	TRACE("Connect() on address %s", PrintAddress(address));
	T(APICall(this, "connect"));

	if (gStackModule->is_restarted_syscall()) {
		// The SYN was already sent by the interrupted call, just continue
		// to wait with the absolute timeout stored back then.
		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
		status_t status = _WaitForEstablished(locker, timeout);
		TRACE("  Connect(): Connection complete: %s (timeout was %"
			B_PRIdBIGTIME ")", strerror(status), timeout);
		return posix_error(status);
	}

	// Can only call connect() from CLOSED or LISTEN states
	// otherwise endpoint is considered already connected
	if (fState == LISTEN) {
		// this socket is about to connect; remove pending connections in the backlog
		gSocketModule->set_max_backlog(socket, 0);
	} else if (fState == ESTABLISHED) {
		return EISCONN;
	} else if (fState != CLOSED)
		return EINPROGRESS;

	// consider destination address INADDR_ANY as INADDR_LOOPBACK
	sockaddr_storage _address;
	if (AddressModule()->is_empty_address(address, false)) {
		AddressModule()->get_loopback_address((sockaddr *)&_address);
		// for IPv4 and IPv6 the port is at the same offset
		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
		address = (sockaddr *)&_address;
	}

	status_t status = _PrepareSendPath(address);
	if (status < B_OK)
		return status;

	TRACE("  Connect(): starting 3-way handshake...");

	fState = SYNCHRONIZE_SENT;
	T(State(this));

	// send SYN
	status = _SendQueued();
	if (status != B_OK) {
		_Close();
		return status;
	}

	// If we are running over Loopback, after _SendQueued() returns we
	// may be in ESTABLISHED already.
	if (fState == ESTABLISHED) {
		TRACE("  Connect() completed after _SendQueued()");
		return B_OK;
	}

	// wait until 3-way handshake is complete (if needed)
	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
	if (timeout == 0) {
		// we're a non-blocking socket
		TRACE("  Connect() delayed, return EINPROGRESS");
		return EINPROGRESS;
	}

	// remember the absolute timeout, so that a restarted syscall does not
	// start to wait all over again
	bigtime_t absoluteTimeout = absolute_timeout(timeout);
	gStackModule->store_syscall_restart_timeout(absoluteTimeout);

	status = _WaitForEstablished(locker, absoluteTimeout);
	TRACE("  Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
		")", strerror(status), timeout);
	return posix_error(status);
}
663 
664 
/*!	Waits for an incoming connection and returns its socket in
	\a _acceptedSocket.
	The call blocks on the accept semaphore, honoring the socket's receive
	timeout (or the stored timeout if the syscall was restarted), and then
	dequeues a connected socket. It retries for as long as dequeuing fails.
	\return B_OK on success, B_WOULD_BLOCK if the receive timeout is 0 and no
		connection is pending, or the error acquiring the semaphore returned.
*/
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
	MutexLocker locker(fLock);

	TRACE("Accept()");
	T(APICall(this, "accept"));

	status_t status;
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	do {
		// the endpoint lock must not be held while blocking on the semaphore
		locker.Unlock();

		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
			| B_CAN_INTERRUPT, timeout);
		if (status != B_OK) {
			// a timeout of 0 means non-blocking operation
			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
				return B_WOULD_BLOCK;

			return status;
		}

		locker.Lock();
		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
#ifdef TRACE_TCP
		if (status == B_OK)
			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
#endif
	} while (status != B_OK);

	return status;
}
702 
703 
704 status_t
705 TCPEndpoint::Bind(const sockaddr *address)
706 {
707 	if (address == NULL)
708 		return B_BAD_VALUE;
709 
710 	MutexLocker lock(fLock);
711 
712 	TRACE("Bind() on address %s", PrintAddress(address));
713 	T(APICall(this, "bind"));
714 
715 	if (fState != CLOSED)
716 		return EISCONN;
717 
718 	return fManager->Bind(this, address);
719 }
720 
721 
/*!	Removes the endpoint's binding via the endpoint manager.
	The \a address argument is not used.
	\return The result of the endpoint manager's Unbind().
*/
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
	MutexLocker _(fLock);

	TRACE("Unbind()");
	T(APICall(this, "unbind"));

	return fManager->Unbind(this);
}
732 
733 
734 status_t
735 TCPEndpoint::Listen(int count)
736 {
737 	MutexLocker _(fLock);
738 
739 	TRACE("Listen()");
740 	T(APICall(this, "listen"));
741 
742 	if (fState != CLOSED && fState != LISTEN)
743 		return B_BAD_VALUE;
744 
745 	if (fState == CLOSED) {
746 		fAcceptSemaphore = create_sem(0, "tcp accept");
747 		if (fAcceptSemaphore < B_OK)
748 			return ENOBUFS;
749 
750 		status_t status = fManager->SetPassive(this);
751 		if (status != B_OK) {
752 			delete_sem(fAcceptSemaphore);
753 			fAcceptSemaphore = -1;
754 			return status;
755 		}
756 	}
757 
758 	gSocketModule->set_max_backlog(socket, count);
759 
760 	fState = LISTEN;
761 	T(State(this));
762 	return B_OK;
763 }
764 
765 
766 status_t
767 TCPEndpoint::Shutdown(int direction)
768 {
769 	MutexLocker lock(fLock);
770 
771 	TRACE("Shutdown(%i)", direction);
772 	T(APICall(this, "shutdown"));
773 
774 	if (direction == SHUT_RD || direction == SHUT_RDWR)
775 		fFlags |= FLAG_NO_RECEIVE;
776 
777 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
778 		// TODO: That's not correct. After read/write shutting down the socket
779 		// one should still be able to read previously arrived data.
780 		_Disconnect(false);
781 	}
782 
783 	return B_OK;
784 }
785 
786 
787 /*!	Puts data contained in \a buffer into send buffer */
788 status_t
789 TCPEndpoint::SendData(net_buffer *buffer)
790 {
791 	MutexLocker lock(fLock);
792 
793 	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
794 		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
795 		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
796 	T(APICall(this, "senddata"));
797 
798 	uint32 flags = buffer->flags;
799 
800 	if (fState == CLOSED)
801 		return ENOTCONN;
802 	if (fState == LISTEN)
803 		return EDESTADDRREQ;
804 	if (!is_writable(fState) && !is_establishing(fState)) {
805 		// we only send signals when called from userland
806 		if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0)
807 			send_signal(find_thread(NULL), SIGPIPE);
808 		return EPIPE;
809 	}
810 
811 	size_t left = buffer->size;
812 
813 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
814 	if (gStackModule->is_restarted_syscall())
815 		timeout = gStackModule->restore_syscall_restart_timeout();
816 	else
817 		gStackModule->store_syscall_restart_timeout(timeout);
818 
819 	while (left > 0) {
820 		while (fSendQueue.Free() < socket->send.low_water_mark) {
821 			// wait until enough space is available
822 			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
823 			if (status < B_OK) {
824 				TRACE("  SendData() returning %s (%d)",
825 					strerror(posix_error(status)), (int)posix_error(status));
826 				return posix_error(status);
827 			}
828 
829 			if (!is_writable(fState) && !is_establishing(fState)) {
830 				// we only send signals when called from userland
831 				if (gStackModule->is_syscall())
832 					send_signal(find_thread(NULL), SIGPIPE);
833 				return EPIPE;
834 			}
835 		}
836 
837 		size_t size = fSendQueue.Free();
838 		if (size < left) {
839 			// we need to split the original buffer
840 			net_buffer* clone = gBufferModule->clone(buffer, false);
841 				// TODO: add offset/size parameter to net_buffer::clone() or
842 				// even a move_data() function, as this is a bit inefficient
843 			if (clone == NULL)
844 				return ENOBUFS;
845 
846 			status_t status = gBufferModule->trim(clone, size);
847 			if (status != B_OK) {
848 				gBufferModule->free(clone);
849 				return status;
850 			}
851 
852 			gBufferModule->remove_header(buffer, size);
853 			left -= size;
854 			fSendQueue.Add(clone);
855 		} else {
856 			left -= buffer->size;
857 			fSendQueue.Add(buffer);
858 		}
859 	}
860 
861 	TRACE("  SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());
862 
863 	bool force = false;
864 	if ((flags & MSG_OOB) != 0) {
865 		fSendUrgentOffset = fSendQueue.LastSequence();
866 			// RFC 961 specifies that the urgent offset points to the last
867 			// byte of urgent data. However, this is commonly implemented as
868 			// here, ie. it points to the first byte after the urgent data.
869 		force = true;
870 	}
871 	if ((flags & MSG_EOF) != 0)
872 		_Disconnect(false);
873 
874 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
875 		_SendQueued(force);
876 
877 	return B_OK;
878 }
879 
880 
881 ssize_t
882 TCPEndpoint::SendAvailable()
883 {
884 	MutexLocker locker(fLock);
885 
886 	ssize_t available;
887 
888 	if (is_writable(fState))
889 		available = fSendQueue.Free();
890 	else if (is_establishing(fState))
891 		available = 0;
892 	else
893 		available = EPIPE;
894 
895 	TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
896 	T(APICall(this, "sendavailable"));
897 	return available;
898 }
899 
900 
/*!	Fills \a stat with the name of the current state, the number of bytes
	available in the receive queue, and the number of bytes used in the send
	queue.
	\return Always B_OK.
*/
status_t
TCPEndpoint::FillStat(net_stat *stat)
{
	MutexLocker _(fLock);

	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
	stat->receive_queue_size = fReceiveQueue.Available();
	stat->send_queue_size = fSendQueue.Used();

	return B_OK;
}
912 
913 
/*!	Reads up to \a numBytes from the receive queue into a buffer that is
	returned in \a _buffer.

	If the connection is still being established, the call first waits for
	that to complete. It then waits until the receive queue holds at least the
	receive low water mark (or \a numBytes with MSG_WAITALL), or until all of
	the available data is pushed data. With MSG_PEEK the data stays in the
	queue; otherwise an acknowledgement may be sent afterwards.

	\return The result of the receive queue's Get() call once data could be
		retrieved, B_OK with \a _buffer set to NULL if the connection is
		closing or reading has been shut down, ENOTCONN if the endpoint is
		closed, B_WOULD_BLOCK if no data is available and the call must not
		block, or the error of waiting (B_TIMED_OUT mapped to B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	MutexLocker locker(fLock);

	TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
		flags);
	T(APICall(this, "readdata"));

	*_buffer = NULL;

	if (fState == CLOSED)
		return ENOTCONN;

	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	// there is nothing to read before the handshake has completed
	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// enough data, or everything that is available has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
			return B_WOULD_BLOCK;

		// reading has been shut down, don't wait for more data
		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %" B_PRIuSIZE " are available.",
		fReceiveQueue.Available());

	// more data is available than requested, wake up other waiting readers
	if (numBytes < fReceiveQueue.Available())
		fReceiveCondition.NotifyAll();

	// when peeking, the data is passed as !clone == false to Get() and
	// is still reported as available afterwards
	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %" B_PRIuSIZE " bytes kept.",
		fReceiveQueue.Available());

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		SendAcknowledge(false);

	return receivedBytes;
}
1009 
1010 
/*!	Returns the result of _AvailableData() while holding the endpoint lock.
	Note that _AvailableData() is evaluated a second time for the TRACE()
	output when TRACE_TCP is enabled.
*/
ssize_t
TCPEndpoint::ReadAvailable()
{
	MutexLocker locker(fLock);

	TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
	T(APICall(this, "readavailable"));

	return _AvailableData();
}
1021 
1022 
/*!	Sets the maximum number of bytes of the send queue to \a length.
	\return Always B_OK.
*/
status_t
TCPEndpoint::SetSendBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fSendQueue.SetMaxBytes(length);
	return B_OK;
}
1030 
1031 
/*!	Sets the maximum number of bytes of the receive queue to \a length.
	\return Always B_OK.
*/
status_t
TCPEndpoint::SetReceiveBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fReceiveQueue.SetMaxBytes(length);
	return B_OK;
}
1039 
1040 
1041 status_t
1042 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1043 {
1044 	if (*_length != sizeof(int))
1045 		return B_BAD_VALUE;
1046 
1047 	int* value = (int*)_value;
1048 
1049 	switch (option) {
1050 		case TCP_NODELAY:
1051 			if ((fOptions & TCP_NODELAY) != 0)
1052 				*value = 1;
1053 			else
1054 				*value = 0;
1055 			return B_OK;
1056 
1057 		case TCP_MAXSEG:
1058 			*value = fReceiveMaxSegmentSize;
1059 			return B_OK;
1060 
1061 		default:
1062 			return B_BAD_VALUE;
1063 	}
1064 }
1065 
1066 
1067 status_t
1068 TCPEndpoint::SetOption(int option, const void* _value, int length)
1069 {
1070 	if (option != TCP_NODELAY)
1071 		return B_BAD_VALUE;
1072 
1073 	if (length != sizeof(int))
1074 		return B_BAD_VALUE;
1075 
1076 	const int* value = (const int*)_value;
1077 
1078 	MutexLocker _(fLock);
1079 	if (*value)
1080 		fOptions |= TCP_NODELAY;
1081 	else
1082 		fOptions &= ~TCP_NODELAY;
1083 
1084 	return B_OK;
1085 }
1086 
1087 
1088 //	#pragma mark - misc
1089 
1090 
/*!	Returns whether the endpoint has a local address, i.e. whether
	LocalAddress() is not empty.
*/
bool
TCPEndpoint::IsBound() const
{
	return !LocalAddress().IsEmpty(true);
}
1096 
1097 
/*!	Returns whether FLAG_LOCAL is set for this endpoint. */
bool
TCPEndpoint::IsLocal() const
{
	return (fFlags & FLAG_LOCAL) != 0;
}
1103 
1104 
/*!	Requests a delayed acknowledgement.
	If no delayed acknowledge is pending yet, the timer is started with
	TCP_DELAYED_ACKNOWLEDGE_TIMEOUT. If one is already pending, the timer is
	cancelled and the acknowledgement is sent right away.
	\return B_OK if the timer has been started, otherwise the result of
		SendAcknowledge().
*/
status_t
TCPEndpoint::DelayedAcknowledge()
{
	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
		// The timer was active, i.e. an ACK was already pending: send it now.
		// This way, at least every other call results in an ACK being sent.
		T(TimerSet(this, "delayed ack", -1));
		return SendAcknowledge(true);
	}

	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
	T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
	return B_OK;
}
1120 
1121 
/*!	Calls _SendQueued() with \a force and 0 as its second argument, and
	returns its result.
*/
status_t
TCPEndpoint::SendAcknowledge(bool force)
{
	return _SendQueued(force, 0);
}
1127 
1128 
/*!	(Re)starts the persist timer with TCP_PERSIST_TIMEOUT. */
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
	T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
}
1135 
1136 
/*!	Starts the time-wait timer.
	This method does not change the state itself. If the endpoint already is
	in the TIME_WAIT state, the connection timers are cancelled first; for
	local connections, FLAG_DELETE_ON_CLOSE is then set instead of starting
	the timer.
*/
void
TCPEndpoint::_EnterTimeWait()
{
	TRACE("_EnterTimeWait()");

	if (fState == TIME_WAIT) {
		_CancelConnectionTimers();

		if (IsLocal()) {
			// we do not use TIME_WAIT state for local connections
			fFlags |= FLAG_DELETE_ON_CLOSE;
			return;
		}
	}

	_UpdateTimeWait();
}
1154 
1155 
1156 void
1157 TCPEndpoint::_UpdateTimeWait()
1158 {
1159 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1160 	T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1161 }
1162 
1163 
/*!	Cancels the retransmit, persist, and delayed acknowledge timers.
	The time-wait timer is not touched.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	T(TimerSet(this, "retransmit", -1));
	gStackModule->cancel_timer(&fPersistTimer);
	T(TimerSet(this, "persist", -1));
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
	T(TimerSet(this, "delayed ack", -1));
}
1174 
1175 
1176 /*!	Sends the FIN flag to the peer when the connection is still open.
1177 	Moves the endpoint to the next state depending on where it was.
1178 */
1179 status_t
1180 TCPEndpoint::_Disconnect(bool closing)
1181 {
1182 	tcp_state previousState = fState;
1183 
1184 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1185 		fState = FINISH_SENT;
1186 	else if (fState == FINISH_RECEIVED)
1187 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1188 	else
1189 		return B_OK;
1190 
1191 	T(State(this));
1192 
1193 	status_t status = _SendQueued();
1194 	if (status != B_OK) {
1195 		fState = previousState;
1196 		T(State(this));
1197 		return status;
1198 	}
1199 
1200 	return B_OK;
1201 }
1202 
1203 
1204 void
1205 TCPEndpoint::_MarkEstablished()
1206 {
1207 	fState = ESTABLISHED;
1208 	T(State(this));
1209 
1210 	if (gSocketModule->has_parent(socket)) {
1211 		gSocketModule->set_connected(socket);
1212 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1213 	}
1214 
1215 	fSendCondition.NotifyAll();
1216 	gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1217 }
1218 
1219 
1220 status_t
1221 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1222 {
1223 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1224 	// When investigating this, also have a look at _Close() and _HandleReset().
1225 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1226 		if (socket->error != B_OK)
1227 			return socket->error;
1228 
1229 		status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1230 		if (status < B_OK)
1231 			return status;
1232 	}
1233 
1234 	return B_OK;
1235 }
1236 
1237 
1238 //	#pragma mark - receive
1239 
1240 
1241 void
1242 TCPEndpoint::_Close()
1243 {
1244 	_CancelConnectionTimers();
1245 	fState = CLOSED;
1246 	T(State(this));
1247 
1248 	fFlags |= FLAG_DELETE_ON_CLOSE;
1249 
1250 	fSendCondition.NotifyAll();
1251 	_NotifyReader();
1252 
1253 	if (gSocketModule->has_parent(socket)) {
1254 		// We still have a parent - obviously, we haven't been accepted yet,
1255 		// so no one could ever close us.
1256 		_CancelConnectionTimers();
1257 		gSocketModule->set_aborted(socket);
1258 	}
1259 }
1260 
1261 
1262 void
1263 TCPEndpoint::_HandleReset(status_t error)
1264 {
1265 	socket->error = error;
1266 	_Close();
1267 
1268 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1269 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1270 }
1271 
1272 
1273 void
1274 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1275 {
1276 	if (++fDuplicateAcknowledgeCount < 3)
1277 		return;
1278 
1279 	if (fDuplicateAcknowledgeCount == 3) {
1280 		_ResetSlowStart();
1281 		fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
1282 		fSendNext = segment.acknowledge;
1283 	} else if (fDuplicateAcknowledgeCount > 3)
1284 		fCongestionWindow += fSendMaxSegmentSize;
1285 
1286 	_SendQueued();
1287 }
1288 
1289 
1290 void
1291 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1292 	size_t segmentLength)
1293 {
1294 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1295 		tcp_sequence sequence(segment.sequence);
1296 
1297 		if (fLastAcknowledgeSent >= sequence
1298 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1299 			fReceivedTimestamp = segment.timestamp_value;
1300 	}
1301 }
1302 
1303 
1304 ssize_t
1305 TCPEndpoint::_AvailableData() const
1306 {
1307 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1308 	//       the application of FLAG_NO_RECEIVE in listen()ing
1309 	//       sockets.
1310 	if (fState == LISTEN)
1311 		return gSocketModule->count_connected(socket);
1312 	if (fState == SYNCHRONIZE_SENT)
1313 		return 0;
1314 
1315 	ssize_t availableData = fReceiveQueue.Available();
1316 
1317 	if (availableData == 0 && !_ShouldReceive())
1318 		return ENOTCONN;
1319 
1320 	return availableData;
1321 }
1322 
1323 
1324 void
1325 TCPEndpoint::_NotifyReader()
1326 {
1327 	fReceiveCondition.NotifyAll();
1328 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1329 }
1330 
1331 
1332 bool
1333 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1334 {
1335 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1336 		// Remember the position of the finish received flag
1337 		fFinishReceived = true;
1338 		fFinishReceivedAt = segment.sequence + buffer->size;
1339 	}
1340 
1341 	fReceiveQueue.Add(buffer, segment.sequence);
1342 	fReceiveNext = fReceiveQueue.NextSequence();
1343 
1344 	if (fFinishReceived) {
1345 		// Set or reset the finish flag on the current segment
1346 		if (fReceiveNext < fFinishReceivedAt)
1347 			segment.flags &= ~TCP_FLAG_FINISH;
1348 		else
1349 			segment.flags |= TCP_FLAG_FINISH;
1350 	}
1351 
1352 	TRACE("  _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1353 		B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1354 
1355 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1356 		fReceiveQueue.SetPushPointer();
1357 
1358 	return fReceiveQueue.Available() > 0;
1359 }
1360 
1361 
1362 void
1363 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1364 {
1365 	fInitialReceiveSequence = segment.sequence;
1366 	fFinishReceived = false;
1367 
1368 	// count the received SYN
1369 	segment.sequence++;
1370 
1371 	fReceiveNext = segment.sequence;
1372 	fReceiveQueue.SetInitialSequence(segment.sequence);
1373 
1374 	if ((fOptions & TCP_NOOPT) == 0) {
1375 		if (segment.max_segment_size > 0)
1376 			fSendMaxSegmentSize = segment.max_segment_size;
1377 
1378 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1379 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1380 			fSendWindowShift = segment.window_shift;
1381 		} else {
1382 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1383 			fReceiveWindowShift = 0;
1384 		}
1385 
1386 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1387 			fFlags |= FLAG_OPTION_TIMESTAMP;
1388 			fReceivedTimestamp = segment.timestamp_value;
1389 		} else
1390 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1391 	}
1392 
1393 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1394 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1395 }
1396 
1397 
1398 bool
1399 TCPEndpoint::_ShouldReceive() const
1400 {
1401 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1402 		return false;
1403 
1404 	return fState == ESTABLISHED || fState == FINISH_SENT
1405 		|| fState == FINISH_ACKNOWLEDGED;
1406 }
1407 
1408 
1409 int32
1410 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1411 	net_buffer* buffer)
1412 {
1413 	MutexLocker _(fLock);
1414 
1415 	// TODO error checking
1416 	ProtocolSocket::Open();
1417 
1418 	fState = SYNCHRONIZE_RECEIVED;
1419 	T(Spawn(parent, this));
1420 
1421 	fManager = parent->fManager;
1422 
1423 	LocalAddress().SetTo(buffer->destination);
1424 	PeerAddress().SetTo(buffer->source);
1425 
1426 	TRACE("Spawn()");
1427 
1428 	// TODO: proper error handling!
1429 	if (fManager->BindChild(this) != B_OK) {
1430 		T(Error(this, "binding failed", __LINE__));
1431 		return DROP;
1432 	}
1433 	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1434 		T(Error(this, "prepare send faild", __LINE__));
1435 		return DROP;
1436 	}
1437 
1438 	fOptions = parent->fOptions;
1439 	fAcceptSemaphore = parent->fAcceptSemaphore;
1440 
1441 	_PrepareReceivePath(segment);
1442 
1443 	// send SYN+ACK
1444 	if (_SendQueued() != B_OK) {
1445 		T(Error(this, "sending failed", __LINE__));
1446 		return DROP;
1447 	}
1448 
1449 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1450 		// we handled this flag now, it must not be set for further processing
1451 
1452 	return _Receive(segment, buffer);
1453 }
1454 
1455 
1456 int32
1457 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1458 {
1459 	TRACE("ListenReceive()");
1460 
1461 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1462 	// but the error behaviour differs
1463 	if (segment.flags & TCP_FLAG_RESET)
1464 		return DROP;
1465 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1466 		return DROP | RESET;
1467 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1468 		return DROP;
1469 
1470 	// TODO: drop broadcast/multicast
1471 
1472 	// spawn new endpoint for accept()
1473 	net_socket* newSocket;
1474 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1475 		T(Error(this, "spawning failed", __LINE__));
1476 		return DROP;
1477 	}
1478 
1479 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1480 		segment, buffer);
1481 }
1482 
1483 
1484 int32
1485 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1486 	net_buffer *buffer)
1487 {
1488 	TRACE("_SynchronizeSentReceive()");
1489 
1490 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1491 		&& (fInitialSendSequence >= segment.acknowledge
1492 			|| fSendMax < segment.acknowledge))
1493 		return DROP | RESET;
1494 
1495 	if (segment.flags & TCP_FLAG_RESET) {
1496 		_HandleReset(ECONNREFUSED);
1497 		return DROP;
1498 	}
1499 
1500 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1501 		return DROP;
1502 
1503 	fSendUnacknowledged = segment.acknowledge;
1504 	_PrepareReceivePath(segment);
1505 
1506 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1507 		_MarkEstablished();
1508 	} else {
1509 		// simultaneous open
1510 		fState = SYNCHRONIZE_RECEIVED;
1511 		T(State(this));
1512 	}
1513 
1514 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1515 		// we handled this flag now, it must not be set for further processing
1516 
1517 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1518 }
1519 
1520 
/*!	Processes an incoming segment in all states but LISTEN and
	SYNCHRONIZE_SENT (which is asserted below).

	After an optional fast path for the ESTABLISHED state, the segment is
	processed in these steps:
	- check whether it is within the receive window,
	- handle RST and unexpected SYN,
	- trim the buffer to the receive window,
	- take over the advertised send window,
	- process the acknowledgement (including state changes that are
	  caused by our FIN being acknowledged),
	- queue the data, and
	- process a received FIN.

	\param segment The parsed header of the segment; its sequence and flags
		may be altered while the segment is trimmed.
	\param buffer The buffer holding the data of the segment.
	\return A combination of the \c KEEP, \c DROP, \c ACKNOWLEDGE,
		\c IMMEDIATE_ACKNOWLEDGE, and \c RESET action flags.
*/
int32
TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
{
	uint32 advertisedWindow = (uint32)segment.advertised_window
		<< fSendWindowShift;
	size_t segmentLength = buffer->size;

	// First, handle the most common case for uni-directional data transfer
	// (known as header prediction - the segment must not change the window,
	// and must be the expected sequence, and contain no control flags)

	if (fState == ESTABLISHED
		&& segment.AcknowledgeOnly()
		&& fReceiveNext == segment.sequence
		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
		&& fSendNext == fSendMax) {
		_UpdateTimestamps(segment, segmentLength);

		if (segmentLength == 0) {
			// this is a pure acknowledge segment - we're on the sending end
			if (fSendUnacknowledged < segment.acknowledge
				&& fSendMax >= segment.acknowledge) {
				_Acknowledged(segment);
				return DROP;
			}
		} else if (segment.acknowledge == fSendUnacknowledged
			&& fReceiveQueue.IsContiguous()
			&& fReceiveQueue.Free() >= segmentLength
			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
			// this is a pure in-sequence data segment that fits into the
			// receive queue - we're on the receiving end
			if (_AddData(segment, buffer))
				_NotifyReader();

			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
		}
	}

	// The fast path was not applicable, so we continue with the standard
	// processing of the incoming segment

	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);

	if (fState != CLOSED && fState != TIME_WAIT) {
		// Check sequence number
		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
				fReceiveWindow)) {
			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
			if ((segment.flags & TCP_FLAG_RESET) != 0) {
				// TODO: this doesn't look right - review!
				return DROP;
			}
			return DROP | IMMEDIATE_ACKNOWLEDGE;
		}
	}

	if ((segment.flags & TCP_FLAG_RESET) != 0) {
		// Is this a valid reset?
		// We generally ignore resets in time wait state (see RFC 1337)
		if (fLastAcknowledgeSent <= segment.sequence
			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
				+ fReceiveWindow)
			&& fState != TIME_WAIT) {
			// the error reported to the socket depends on our state
			status_t error;
			if (fState == SYNCHRONIZE_RECEIVED)
				error = ECONNREFUSED;
			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
				error = ENOTCONN;
			else
				error = ECONNRESET;

			_HandleReset(error);
		}

		return DROP;
	}

	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
		|| (fState == SYNCHRONIZE_RECEIVED
			&& (fInitialReceiveSequence > segment.sequence
				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
					&& (fSendUnacknowledged > segment.acknowledge
						|| fSendMax < segment.acknowledge))))) {
		// reset the connection - either the initial SYN was faulty, or we
		// received a SYN within the data stream
		return DROP | RESET;
	}

	// TODO: Check this! Why do we advertise a window outside of what we should
	// buffer?
	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
		// the window must not shrink

	// trim buffer to be within the receive window
	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
	if (drop > 0) {
		if ((uint32)drop > buffer->size
			|| ((uint32)drop == buffer->size
				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
			// The whole segment is a duplicate: drop all of its data, and
			// clear the FIN flag, so that a FIN is not processed again.
			segment.flags &= ~TCP_FLAG_FINISH;
			drop = buffer->size;
		}

		// remove duplicate data at the start
		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
		gBufferModule->remove_header(buffer, drop);
		segment.sequence += drop;
	}

	int32 action = KEEP;

	// from here on, "drop" is the number of bytes that reach beyond the
	// right edge of the receive window
	drop = (int32)(segment.sequence + buffer->size
		- (fReceiveNext + fReceiveWindow)).Number();
	if (drop > 0) {
		// remove data exceeding our window
		if ((uint32)drop >= buffer->size) {
			// if we can accept data, or the segment is not what we'd expect,
			// drop the segment (an immediate acknowledge is always triggered)
			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
				return DROP | IMMEDIATE_ACKNOWLEDGE;

			action |= IMMEDIATE_ACKNOWLEDGE;
		}

		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
			// we need to remove the finish, too, as part of the data
			drop--;
		}

		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
		gBufferModule->remove_trailer(buffer, drop);
	}

#ifdef TRACE_TCP
	if (advertisedWindow > fSendWindow) {
		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
			fSendWindow, advertisedWindow);
	}
#endif

	// take over the window the peer advertised, and keep track of the
	// largest window it ever offered
	fSendWindow = advertisedWindow;
	if (advertisedWindow > fSendMaxWindow)
		fSendMaxWindow = advertisedWindow;

	// Then look at the acknowledgement for any updates

	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
		// process acknowledged data
		if (fState == SYNCHRONIZE_RECEIVED)
			_MarkEstablished();

		// the peer acknowledges something we have not sent yet
		if (fSendMax < segment.acknowledge)
			return DROP | IMMEDIATE_ACKNOWLEDGE;

		if (segment.acknowledge < fSendUnacknowledged) {
			// NOTE(review): fSendWindow has just been overwritten with
			// advertisedWindow above, so the window comparison below always
			// holds at this point - confirm whether the previous window
			// was meant to be compared instead.
			if (buffer->size == 0 && advertisedWindow == fSendWindow
				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
				TRACE("Receive(): duplicate ack!");

				_DuplicateAcknowledge(segment);
			}

			return DROP;
		} else {
			// this segment acknowledges in flight data

			if (fDuplicateAcknowledgeCount >= 3) {
				// deflate the window.
				fCongestionWindow = fSlowStartThreshold;
			}

			fDuplicateAcknowledgeCount = 0;

			if (fSendMax == segment.acknowledge)
				TRACE("Receive(): all inflight data ack'd!");

			if (segment.acknowledge > fSendQueue.LastSequence()
					&& fState > ESTABLISHED) {
				TRACE("Receive(): FIN has been acknowledged!");

				switch (fState) {
					case FINISH_SENT:
						fState = FINISH_ACKNOWLEDGED;
						T(State(this));
						break;
					case CLOSING:
						fState = TIME_WAIT;
						T(State(this));
						_EnterTimeWait();
						return DROP;
					case WAIT_FOR_FINISH_ACKNOWLEDGE:
						_Close();
						break;

					default:
						break;
				}
			}

			if (fState != CLOSED)
				_Acknowledged(segment);
		}
	}

	if (segment.flags & TCP_FLAG_URGENT) {
		if (fState == ESTABLISHED || fState == FINISH_SENT
			|| fState == FINISH_ACKNOWLEDGED) {
			// TODO: Handle urgent data:
			//  - RCV.UP <- max(RCV.UP, SEG.UP)
			//  - signal the user that urgent data is available (SIGURG)
		}
	}

	bool notify = false;

	// The buffer may be freed if its data is added to the queue, so cache
	// the size as we still need it later.
	uint32 bufferSize = buffer->size;

	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
		&& _ShouldReceive())
		notify = _AddData(segment, buffer);
	else {
		// The data is not queued; if receiving has been disabled, we still
		// advance fReceiveNext by the size of the data we throw away.
		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			fReceiveNext += buffer->size;

		action = (action & ~KEEP) | DROP;
	}

	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// the FIN occupies one sequence number
		segmentLength++;
		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
			TRACE("Receive(): peer is finishing connection!");
			fReceiveNext++;
			notify = true;

			// FIN implies PUSH
			fReceiveQueue.SetPushPointer();

			// we'll reply immediately to the FIN if we are not
			// transitioning to TIME WAIT so we immediately ACK it.
			action |= IMMEDIATE_ACKNOWLEDGE;

			// other side is closing connection; change states
			switch (fState) {
				case ESTABLISHED:
				case SYNCHRONIZE_RECEIVED:
					fState = FINISH_RECEIVED;
					T(State(this));
					break;
				case FINISH_SENT:
					// simultaneous close
					fState = CLOSING;
					T(State(this));
					break;
				case FINISH_ACKNOWLEDGED:
					fState = TIME_WAIT;
					T(State(this));
					_EnterTimeWait();
					break;
				case TIME_WAIT:
					_UpdateTimeWait();
					break;

				default:
					break;
			}
		}
	}

	if (notify)
		_NotifyReader();

	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
		action |= ACKNOWLEDGE;

	_UpdateTimestamps(segment, segmentLength);

	TRACE("Receive() Action %" B_PRId32, action);

	return action;
}
1805 
1806 
1807 int32
1808 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1809 {
1810 	MutexLocker locker(fLock);
1811 
1812 	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1813 		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1814 		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1815 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1816 		segment.acknowledge,
1817 		(uint32)segment.advertised_window << fSendWindowShift);
1818 	T(Receive(this, segment,
1819 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1820 	int32 segmentAction = DROP;
1821 
1822 	switch (fState) {
1823 		case LISTEN:
1824 			segmentAction = _ListenReceive(segment, buffer);
1825 			break;
1826 
1827 		case SYNCHRONIZE_SENT:
1828 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1829 			break;
1830 
1831 		case SYNCHRONIZE_RECEIVED:
1832 		case ESTABLISHED:
1833 		case FINISH_RECEIVED:
1834 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1835 		case FINISH_SENT:
1836 		case FINISH_ACKNOWLEDGED:
1837 		case CLOSING:
1838 		case TIME_WAIT:
1839 		case CLOSED:
1840 			segmentAction = _Receive(segment, buffer);
1841 			break;
1842 	}
1843 
1844 	// process acknowledge action as asked for by the *Receive() method
1845 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1846 		SendAcknowledge(true);
1847 	else if (segmentAction & ACKNOWLEDGE)
1848 		DelayedAcknowledge();
1849 
1850 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1851 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1852 		locker.Unlock();
1853 		gSocketModule->release_socket(socket);
1854 	}
1855 
1856 	return segmentAction;
1857 }
1858 
1859 
1860 //	#pragma mark - send
1861 
1862 
1863 inline uint8
1864 TCPEndpoint::_CurrentFlags()
1865 {
1866 	// we don't set FLAG_FINISH here, instead we do it
1867 	// conditionally below depending if we are sending
1868 	// the last bytes of the send queue.
1869 
1870 	switch (fState) {
1871 		case CLOSED:
1872 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1873 
1874 		case SYNCHRONIZE_SENT:
1875 			return TCP_FLAG_SYNCHRONIZE;
1876 		case SYNCHRONIZE_RECEIVED:
1877 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1878 
1879 		case ESTABLISHED:
1880 		case FINISH_RECEIVED:
1881 		case FINISH_ACKNOWLEDGED:
1882 		case TIME_WAIT:
1883 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1884 		case FINISH_SENT:
1885 		case CLOSING:
1886 			return TCP_FLAG_ACKNOWLEDGE;
1887 
1888 		default:
1889 			return 0;
1890 	}
1891 }
1892 
1893 
1894 inline bool
1895 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1896 	uint32 segmentMaxSize, uint32 flightSize)
1897 {
1898 	if (length > 0) {
1899 		// Avoid the silly window syndrome - we only send a segment in case:
1900 		// - we have a full segment to send, or
1901 		// - we're at the end of our buffer queue, or
1902 		// - the buffer is at least larger than half of the maximum send window,
1903 		//   or
1904 		// - we're retransmitting data
1905 		if (length == segmentMaxSize
1906 			|| (fOptions & TCP_NODELAY) != 0
1907 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1908 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1909 			return true;
1910 	}
1911 
1912 	// check if we need to send a window update to the peer
1913 	if (segment.advertised_window > 0) {
1914 		// correct the window to take into account what already has been advertised
1915 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1916 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1917 
1918 		// if we can advertise a window larger than twice the maximum segment
1919 		// size, or half the maximum buffer size we send a window update
1920 		if (window >= (fReceiveMaxSegmentSize << 1)
1921 			|| window >= (socket->receive.buffer_size >> 1))
1922 			return true;
1923 	}
1924 
1925 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1926 			| TCP_FLAG_RESET)) != 0)
1927 		return true;
1928 
1929 	// We do have urgent data pending
1930 	if (fSendUrgentOffset > fSendNext)
1931 		return true;
1932 
1933 	// there is no reason to send a segment just now
1934 	return false;
1935 }
1936 
1937 
1938 status_t
1939 TCPEndpoint::_SendQueued(bool force)
1940 {
1941 	return _SendQueued(force, fSendWindow);
1942 }
1943 
1944 
/*!	Sends one or more TCP segments with the data waiting in the queue, or some
	specific flags that need to be sent.

	\param force If \c true, a segment is sent without consulting
		_ShouldSendSegment(); with a window of zero, and data left in the
		send queue, a single byte is sent to probe the window.
	\param sendWindow The window that may be used for sending; it is further
		limited by the congestion window.
	\return \c B_OK on success (this includes the case that nothing had to
		be sent), \c B_ERROR if there is no route or the endpoint is
		listening, \c B_NO_MEMORY if no buffer could be created, or the
		error of the step that failed.
*/
status_t
TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
{
	if (fRoute == NULL)
		return B_ERROR;

	// in passive state?
	if (fState == LISTEN)
		return B_ERROR;

	tcp_segment_header segment(_CurrentFlags());

	if ((fOptions & TCP_NOOPT) == 0) {
		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
			segment.options |= TCP_HAS_TIMESTAMPS;
			segment.timestamp_reply = fReceivedTimestamp;
			segment.timestamp_value = tcp_now();
		}

		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
			&& fSendNext == fInitialSendSequence) {
			// add connection establishment options
			segment.max_segment_size = fReceiveMaxSegmentSize;
			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
				segment.options |= TCP_HAS_WINDOW_SCALE;
				segment.window_shift = fReceiveWindowShift;
			}
		}
	}

	// advertise the free space of our receive queue as receive window
	size_t availableBytes = fReceiveQueue.Free();
	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
		segment.advertised_window = availableBytes >> fReceiveWindowShift;
	else
		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);

	segment.acknowledge = fReceiveNext.Number();

	// Process urgent data
	if (fSendUrgentOffset > fSendNext) {
		segment.flags |= TCP_FLAG_URGENT;
		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
	} else {
		fSendUrgentOffset = fSendUnacknowledged.Number();
			// Keep urgent offset updated, so that it doesn't reach into our
			// send window on overlap
		segment.urgent_offset = 0;
	}

	// the congestion window limits the window we may use
	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
		sendWindow = fCongestionWindow;

	// fSendUnacknowledged
	//  |    fSendNext      fSendMax
	//  |        |              |
	//  v        v              v
	//  -----------------------------------
	//  | effective window           |
	//  -----------------------------------

	// Flight size represents the window of data which is currently in the
	// ether. We should never send data such as the flight size becomes larger
	// than the effective window. Note however that the effective window may be
	// reduced (by congestion for instance), so at some point in time flight
	// size may be larger than the currently calculated window.

	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();

	if (consumedWindow > sendWindow) {
		sendWindow = 0;
		// TODO: enter persist state? try to get a window update.
	} else
		sendWindow -= consumedWindow;

	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
		// send one byte of data to ask for a window update
		// (triggered by the persist timer)
		sendWindow = 1;
	}

	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
		// only start the timer if nothing has been in flight before
	bool retransmit = fSendNext < fSendMax;

	// Each iteration sends a single segment; the loop ends when all of
	// "length" has been sent - or after the first segment when we are
	// retransmitting.
	do {
		uint32 segmentMaxSize = fSendMaxSegmentSize
			- tcp_options_length(segment);
		uint32 segmentLength = min_c(length, segmentMaxSize);

		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
			// this segment reaches the end of the send queue
			if (state_needs_finish(fState))
				segment.flags |= TCP_FLAG_FINISH;
			if (length > 0)
				segment.flags |= TCP_FLAG_PUSH;
		}

		// Determine if we should really send this segment
		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
				segmentMaxSize, flightSize)) {
			// We hold the data back for now; start the persist timer if the
			// send queue has data, and neither it nor the retransmit timer
			// is active.
			if (fSendQueue.Available()
				&& !gStackModule->is_timer_active(&fPersistTimer)
				&& !gStackModule->is_timer_active(&fRetransmitTimer))
				_StartPersistTimer();
			break;
		}

		net_buffer *buffer = gBufferModule->create(256);
		if (buffer == NULL)
			return B_NO_MEMORY;

		status_t status = B_OK;
		if (segmentLength > 0)
			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
		if (status < B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		LocalAddress().CopyTo(buffer->source);
		PeerAddress().CopyTo(buffer->destination);

		// the size of the data, before the TCP header is added
		uint32 size = buffer->size;
		segment.sequence = fSendNext.Number();

		TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to "
			"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
			", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
			buffer, buffer->size, PrintAddress(buffer->source),
			PrintAddress(buffer->destination), segment.flags, segment.sequence,
			segment.acknowledge, segment.advertised_window,
			fCongestionWindow, fSlowStartThreshold, segmentLength,
			fSendQueue.FirstSequence().Number(),
			fSendQueue.LastSequence().Number());
		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
			fSendQueue.LastSequence()));

		PROBE(buffer, sendWindow);
		sendWindow -= buffer->size;

		status = add_tcp_header(AddressModule(), segment, buffer);
		if (status != B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		// Update send status - we need to do this before we send the data
		// for local connections as the answer is directly handled

		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
			// The header has been written already; make sure the
			// establishment options are not set for any further segment.
			// The SYN itself counts as one sequence number.
			segment.options &= ~TCP_HAS_WINDOW_SCALE;
			segment.max_segment_size = 0;
			size++;
		}

		// the FIN counts as one sequence number, too
		if (segment.flags & TCP_FLAG_FINISH)
			size++;

		// remember the old fSendMax, so that it can be restored on failure
		uint32 sendMax = fSendMax.Number();
		fSendNext += size;
		if (fSendMax < fSendNext)
			fSendMax = fSendNext;

		fReceiveMaxAdvertised = fReceiveNext
			+ ((uint32)segment.advertised_window << fReceiveWindowShift);

		status = next->module->send_routed_data(next, fRoute, buffer);
		if (status < B_OK) {
			gBufferModule->free(buffer);

			fSendNext = segment.sequence;
			fSendMax = sendMax;
				// restore send status
			return status;
		}

		if (shouldStartRetransmitTimer && size > 0) {
			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
				fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
			shouldStartRetransmitTimer = false;
		}

		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
			fLastAcknowledgeSent = segment.acknowledge;

		length -= segmentLength;
		// these flags must only be sent once
		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
			| TCP_FLAG_FINISH);

		// only a single segment is sent when retransmitting
		if (retransmit)
			break;

	} while (length > 0);

	return B_OK;
}
2147 
2148 
2149 int
2150 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2151 {
2152 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2153 }
2154 
2155 
2156 status_t
2157 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2158 {
2159 	if (fRoute == NULL) {
2160 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2161 		if (fRoute == NULL)
2162 			return ENETUNREACH;
2163 
2164 		if ((fRoute->flags & RTF_LOCAL) != 0)
2165 			fFlags |= FLAG_LOCAL;
2166 	}
2167 
2168 	// make sure connection does not already exist
2169 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2170 		fRoute->interface_address->local);
2171 	if (status < B_OK)
2172 		return status;
2173 
2174 	fInitialSendSequence = system_time() >> 4;
2175 	fSendNext = fInitialSendSequence;
2176 	fSendUnacknowledged = fInitialSendSequence;
2177 	fSendMax = fInitialSendSequence;
2178 	fSendUrgentOffset = fInitialSendSequence;
2179 
2180 	// we are counting the SYN here
2181 	fSendQueue.SetInitialSequence(fSendNext + 1);
2182 
2183 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2184 
2185 	// Compute the window shift we advertise to our peer - if it doesn't support
2186 	// this option, this will be reset to 0 (when its SYN is received)
2187 	fReceiveWindowShift = 0;
2188 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2189 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2190 		fReceiveWindowShift++;
2191 	}
2192 
2193 	return B_OK;
2194 }
2195 
2196 
2197 void
2198 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2199 {
2200 	TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
2201 		B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
2202 		fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());
2203 
2204 	ASSERT(fSendUnacknowledged <= segment.acknowledge);
2205 
2206 	if (fSendUnacknowledged < segment.acknowledge) {
2207 		fSendQueue.RemoveUntil(segment.acknowledge);
2208 		fSendUnacknowledged = segment.acknowledge;
2209 		if (fSendNext < fSendUnacknowledged)
2210 			fSendNext = fSendUnacknowledged;
2211 
2212 		if (segment.options & TCP_HAS_TIMESTAMPS)
2213 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2214 		else {
2215 			// TODO: Fallback to RFC 793 type estimation; This just resets
2216 			// any potential exponential back off that happened due to
2217 			// retransmits.
2218 			fRetransmitTimeout = TCP_INITIAL_RTT;
2219 		}
2220 
2221 		if (fSendUnacknowledged == fSendMax) {
2222 			TRACE("all acknowledged, cancelling retransmission timer");
2223 			gStackModule->cancel_timer(&fRetransmitTimer);
2224 			T(TimerSet(this, "retransmit", -1));
2225 		} else {
2226 			TRACE("data acknowledged, resetting retransmission timer to: %"
2227 				B_PRIdBIGTIME, fRetransmitTimeout);
2228 			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2229 			T(TimerSet(this, "retransmit", fRetransmitTimeout));
2230 		}
2231 
2232 		if (is_writable(fState)) {
2233 			// notify threads waiting on the socket to become writable again
2234 			fSendCondition.NotifyAll();
2235 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
2236 		}
2237 
2238 		if (fCongestionWindow < fSlowStartThreshold)
2239 			fCongestionWindow += fSendMaxSegmentSize;
2240 	}
2241 
2242 	if (fCongestionWindow >= fSlowStartThreshold) {
2243 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2244 
2245 		if (increment < fCongestionWindow)
2246 			increment = 1;
2247 		else
2248 			increment /= fCongestionWindow;
2249 
2250 		fCongestionWindow += increment;
2251 	}
2252 
2253 	// if there is data left to be sent, send it now
2254 	if (fSendQueue.Used() > 0)
2255 		_SendQueued();
2256 }
2257 
2258 
2259 void
2260 TCPEndpoint::_Retransmit()
2261 {
2262 	TRACE("Retransmit()");
2263 
2264 	_ResetSlowStart();
2265 	fSendNext = fSendUnacknowledged;
2266 
2267 	// Do exponential back off of the retransmit timeout
2268 	fRetransmitTimeout *= 2;
2269 	if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2270 		fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2271 
2272 	_SendQueued();
2273 }
2274 
2275 
2276 void
2277 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2278 {
2279 	int32 rtt = roundTripTime;
2280 
2281 	// "smooth" round trip time as per Van Jacobson
2282 	rtt -= fRoundTripTime / 8;
2283 	fRoundTripTime += rtt;
2284 	if (rtt < 0)
2285 		rtt = -rtt;
2286 	rtt -= fRoundTripDeviation / 4;
2287 	fRoundTripDeviation += rtt;
2288 
2289 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2290 		* kTimestampFactor;
2291 	if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2292 		fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2293 
2294 	TRACE("  RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2295 		fRetransmitTimeout, roundTripTime);
2296 }
2297 
2298 
2299 void
2300 TCPEndpoint::_ResetSlowStart()
2301 {
2302 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2303 		2 * fSendMaxSegmentSize);
2304 	fCongestionWindow = fSendMaxSegmentSize;
2305 }
2306 
2307 
2308 //	#pragma mark - timer
2309 
2310 
2311 /*static*/ void
2312 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2313 {
2314 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2315 	T(TimerTriggered(endpoint, "retransmit"));
2316 
2317 	MutexLocker locker(endpoint->fLock);
2318 	if (!locker.IsLocked())
2319 		return;
2320 
2321 	endpoint->_Retransmit();
2322 }
2323 
2324 
2325 /*static*/ void
2326 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2327 {
2328 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2329 	T(TimerTriggered(endpoint, "persist"));
2330 
2331 	MutexLocker locker(endpoint->fLock);
2332 	if (!locker.IsLocked())
2333 		return;
2334 
2335 	// the timer might not have been canceled early enough
2336 	if (endpoint->State() == CLOSED)
2337 		return;
2338 
2339 	endpoint->_SendQueued(true);
2340 }
2341 
2342 
2343 /*static*/ void
2344 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2345 {
2346 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2347 	T(TimerTriggered(endpoint, "delayed ack"));
2348 
2349 	MutexLocker locker(endpoint->fLock);
2350 	if (!locker.IsLocked())
2351 		return;
2352 
2353 	// the timer might not have been canceled early enough
2354 	if (endpoint->State() == CLOSED)
2355 		return;
2356 
2357 	endpoint->SendAcknowledge(true);
2358 }
2359 
2360 
2361 /*static*/ void
2362 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2363 {
2364 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2365 	T(TimerTriggered(endpoint, "time-wait"));
2366 
2367 	MutexLocker locker(endpoint->fLock);
2368 	if (!locker.IsLocked())
2369 		return;
2370 
2371 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2372 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2373 		return;
2374 	}
2375 
2376 	locker.Unlock();
2377 
2378 	gSocketModule->release_socket(endpoint->socket);
2379 }
2380 
2381 
2382 /*static*/ status_t
2383 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2384 	MutexLocker& locker, bigtime_t timeout)
2385 {
2386 	ConditionVariableEntry entry;
2387 	condition.Add(&entry);
2388 
2389 	locker.Unlock();
2390 	status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2391 	locker.Lock();
2392 
2393 	return result;
2394 }
2395 
2396 
2397 //	#pragma mark -
2398 
2399 
2400 void
2401 TCPEndpoint::Dump() const
2402 {
2403 	kprintf("TCP endpoint %p\n", this);
2404 	kprintf("  state: %s\n", name_for_state(fState));
2405 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2406 #if KDEBUG
2407 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2408 #endif
2409 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2410 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2411 	kprintf("  send\n");
2412 	kprintf("    window shift: %" B_PRIu8 "\n", fSendWindowShift);
2413 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2414 		fSendUnacknowledged.Number());
2415 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2416 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2417 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2418 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2419 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2420 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2421 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2422 		fSendQueue.Size());
2423 #if DEBUG_BUFFER_QUEUE
2424 	fSendQueue.Dump();
2425 #endif
2426 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2427 		fLastAcknowledgeSent.Number());
2428 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2429 		fInitialSendSequence.Number());
2430 	kprintf("  receive\n");
2431 	kprintf("    window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2432 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2433 	kprintf("    max advertised: %" B_PRIu32 "\n",
2434 		fReceiveMaxAdvertised.Number());
2435 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2436 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2437 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2438 		fReceiveQueue.Available(), fReceiveQueue.Size());
2439 #if DEBUG_BUFFER_QUEUE
2440 	fReceiveQueue.Dump();
2441 #endif
2442 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2443 		fInitialReceiveSequence.Number());
2444 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2445 		fDuplicateAcknowledgeCount);
2446 	kprintf("  round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2447 		fRoundTripTime, fRoundTripDeviation);
2448 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2449 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2450 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2451 }
2452 
2453